From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: from lists.gentoo.org (pigeon.gentoo.org [208.92.234.80])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by finch.gentoo.org (Postfix) with ESMTPS id D04BD1382C5
	for ; Fri, 20 Apr 2018 11:12:44 +0000 (UTC)
Received: from pigeon.gentoo.org (localhost [127.0.0.1])
	by pigeon.gentoo.org (Postfix) with SMTP id 03DEBE08F2;
	Fri, 20 Apr 2018 11:12:44 +0000 (UTC)
Received: from smtp.gentoo.org (smtp.gentoo.org [140.211.166.183])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by pigeon.gentoo.org (Postfix) with ESMTPS id B1D0EE08F2
	for ; Fri, 20 Apr 2018 11:12:43 +0000 (UTC)
Received: from oystercatcher.gentoo.org (oystercatcher.gentoo.org [148.251.78.52])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.gentoo.org (Postfix) with ESMTPS id 1CF50335C09
	for ; Fri, 20 Apr 2018 11:12:42 +0000 (UTC)
Received: from localhost.localdomain (localhost [IPv6:::1])
	by oystercatcher.gentoo.org (Postfix) with ESMTP id 02AC828D
	for ; Fri, 20 Apr 2018 11:12:40 +0000 (UTC)
From: "Mike Pagano" 
To: gentoo-commits@lists.gentoo.org
Content-Transfer-Encoding: 8bit
Content-type: text/plain; charset=UTF-8
Reply-To: gentoo-dev@lists.gentoo.org, "Mike Pagano" 
Message-ID: <1524222737.51140d7f81ce4a1fc3ec7b8933345b1a1de06e51.mpagano@gentoo>
Subject: [gentoo-commits] proj/linux-patches:4.9 commit in: /
X-VCS-Repository: proj/linux-patches
X-VCS-Files: 0000_README 1094_linux-4.9.95.patch
X-VCS-Directories: /
X-VCS-Committer: mpagano
X-VCS-Committer-Name: Mike Pagano
X-VCS-Revision: 51140d7f81ce4a1fc3ec7b8933345b1a1de06e51
X-VCS-Branch: 4.9
Date: Fri, 20 Apr 2018 11:12:40 +0000 (UTC)
Precedence: bulk
List-Post: 
List-Help: 
List-Unsubscribe: 
List-Subscribe: 
List-Id: Gentoo Linux mail 
X-BeenThere: gentoo-commits@lists.gentoo.org
X-Archives-Salt: dcc13c52-a563-4cc4-84cd-b5e9258c0131
X-Archives-Hash: 57a549cb1c782ac6afd4bd1e0f060dff

commit:     51140d7f81ce4a1fc3ec7b8933345b1a1de06e51
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Fri Apr 20 11:12:17 2018 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Fri Apr 20 11:12:17 2018 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=51140d7f

Linux patch 4.9.95

 0000_README             |    4 +
 1094_linux-4.9.95.patch | 6394 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 6398 insertions(+)

diff --git a/0000_README b/0000_README
index cfa6743..a826f60 100644
--- a/0000_README
+++ b/0000_README
@@ -419,6 +419,10 @@ Patch:  1093_linux-4.9.94.patch
 From:   http://www.kernel.org
 Desc:   Linux 4.9.94
 
+Patch:  1094_linux-4.9.95.patch
+From:   http://www.kernel.org
+Desc:   Linux 4.9.95
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.
diff --git a/1094_linux-4.9.95.patch b/1094_linux-4.9.95.patch new file mode 100644 index 0000000..2b1e337 --- /dev/null +++ b/1094_linux-4.9.95.patch @@ -0,0 +1,6394 @@ +diff --git a/Makefile b/Makefile +index 02188cf8e9af..1aeec9df709d 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 4 + PATCHLEVEL = 9 +-SUBLEVEL = 94 ++SUBLEVEL = 95 + EXTRAVERSION = + NAME = Roaring Lionus + +diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h +index d5423ab15ed5..9fe1043e72d2 100644 +--- a/arch/arm/include/asm/kvm_host.h ++++ b/arch/arm/include/asm/kvm_host.h +@@ -318,4 +318,10 @@ static inline int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, + return -ENXIO; + } + ++static inline bool kvm_arm_harden_branch_predictor(void) ++{ ++ /* No way to detect it yet, pretend it is not there. */ ++ return false; ++} ++ + #endif /* __ARM_KVM_HOST_H__ */ +diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h +index a58bbaa3ec60..d10e36235438 100644 +--- a/arch/arm/include/asm/kvm_mmu.h ++++ b/arch/arm/include/asm/kvm_mmu.h +@@ -223,6 +223,16 @@ static inline unsigned int kvm_get_vmid_bits(void) + return 8; + } + ++static inline void *kvm_get_hyp_vector(void) ++{ ++ return kvm_ksym_ref(__kvm_hyp_vector); ++} ++ ++static inline int kvm_map_vectors(void) ++{ ++ return 0; ++} ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* __ARM_KVM_MMU_H__ */ +diff --git a/arch/arm/include/asm/kvm_psci.h b/arch/arm/include/asm/kvm_psci.h +deleted file mode 100644 +index 6bda945d31fa..000000000000 +--- a/arch/arm/include/asm/kvm_psci.h ++++ /dev/null +@@ -1,27 +0,0 @@ +-/* +- * Copyright (C) 2012 - ARM Ltd +- * Author: Marc Zyngier +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program. If not, see . 
+- */ +- +-#ifndef __ARM_KVM_PSCI_H__ +-#define __ARM_KVM_PSCI_H__ +- +-#define KVM_ARM_PSCI_0_1 1 +-#define KVM_ARM_PSCI_0_2 2 +- +-int kvm_psci_version(struct kvm_vcpu *vcpu); +-int kvm_psci_call(struct kvm_vcpu *vcpu); +- +-#endif /* __ARM_KVM_PSCI_H__ */ +diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c +index c38bfbeec306..ef6595c7d697 100644 +--- a/arch/arm/kvm/arm.c ++++ b/arch/arm/kvm/arm.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + #include "trace.h" +@@ -44,7 +45,6 @@ + #include + #include + #include +-#include + #include + + #ifdef REQUIRES_VIRT +@@ -1088,7 +1088,7 @@ static void cpu_init_hyp_mode(void *dummy) + pgd_ptr = kvm_mmu_get_httbr(); + stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); + hyp_stack_ptr = stack_page + PAGE_SIZE; +- vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); ++ vector_ptr = (unsigned long)kvm_get_hyp_vector(); + + __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); + __cpu_init_stage2(); +@@ -1345,6 +1345,13 @@ static int init_hyp_mode(void) + goto out_err; + } + ++ ++ err = kvm_map_vectors(); ++ if (err) { ++ kvm_err("Cannot map vectors\n"); ++ goto out_err; ++ } ++ + /* + * Map the Hyp stack pages + */ +diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c +index 4e57ebca6e69..de1aedce2a8b 100644 +--- a/arch/arm/kvm/handle_exit.c ++++ b/arch/arm/kvm/handle_exit.c +@@ -21,7 +21,7 @@ + #include + #include + #include +-#include ++#include + #include + + #include "trace.h" +@@ -36,7 +36,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) + kvm_vcpu_hvc_get_imm(vcpu)); + vcpu->stat.hvc_exit_stat++; + +- ret = kvm_psci_call(vcpu); ++ ret = kvm_hvc_call_handler(vcpu); + if (ret < 0) { + vcpu_set_reg(vcpu, 0, ~0UL); + return 1; +diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c +index a08d7a93aebb..3d962257c166 100644 +--- a/arch/arm/kvm/psci.c ++++ b/arch/arm/kvm/psci.c +@@ -15,16 +15,16 @@ + * along with this program. If not, see . 
+ */ + ++#include + #include + #include + #include + + #include + #include +-#include + #include + +-#include ++#include + + /* + * This is an implementation of the Power State Coordination Interface +@@ -33,6 +33,38 @@ + + #define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) + ++static u32 smccc_get_function(struct kvm_vcpu *vcpu) ++{ ++ return vcpu_get_reg(vcpu, 0); ++} ++ ++static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu) ++{ ++ return vcpu_get_reg(vcpu, 1); ++} ++ ++static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu) ++{ ++ return vcpu_get_reg(vcpu, 2); ++} ++ ++static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu) ++{ ++ return vcpu_get_reg(vcpu, 3); ++} ++ ++static void smccc_set_retval(struct kvm_vcpu *vcpu, ++ unsigned long a0, ++ unsigned long a1, ++ unsigned long a2, ++ unsigned long a3) ++{ ++ vcpu_set_reg(vcpu, 0, a0); ++ vcpu_set_reg(vcpu, 1, a1); ++ vcpu_set_reg(vcpu, 2, a2); ++ vcpu_set_reg(vcpu, 3, a3); ++} ++ + static unsigned long psci_affinity_mask(unsigned long affinity_level) + { + if (affinity_level <= 3) +@@ -75,7 +107,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) + unsigned long context_id; + phys_addr_t target_pc; + +- cpu_id = vcpu_get_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK; ++ cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; + if (vcpu_mode_is_32bit(source_vcpu)) + cpu_id &= ~((u32) 0); + +@@ -88,14 +120,14 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) + if (!vcpu) + return PSCI_RET_INVALID_PARAMS; + if (!vcpu->arch.power_off) { +- if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) ++ if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1) + return PSCI_RET_ALREADY_ON; + else + return PSCI_RET_INVALID_PARAMS; + } + +- target_pc = vcpu_get_reg(source_vcpu, 2); +- context_id = vcpu_get_reg(source_vcpu, 3); ++ target_pc = smccc_get_arg2(source_vcpu); ++ context_id = smccc_get_arg3(source_vcpu); + + kvm_reset_vcpu(vcpu); + +@@ -114,7 +146,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) + * NOTE: We always update r0 (or x0) because for PSCI v0.1 + * the general puspose registers are undefined upon CPU_ON. 
+ */ +- vcpu_set_reg(vcpu, 0, context_id); ++ smccc_set_retval(vcpu, context_id, 0, 0, 0); + vcpu->arch.power_off = false; + smp_mb(); /* Make sure the above is visible */ + +@@ -134,8 +166,8 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tmp; + +- target_affinity = vcpu_get_reg(vcpu, 1); +- lowest_affinity_level = vcpu_get_reg(vcpu, 2); ++ target_affinity = smccc_get_arg1(vcpu); ++ lowest_affinity_level = smccc_get_arg2(vcpu); + + /* Determine target affinity mask */ + target_affinity_mask = psci_affinity_mask(lowest_affinity_level); +@@ -198,18 +230,10 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); + } + +-int kvm_psci_version(struct kvm_vcpu *vcpu) +-{ +- if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features)) +- return KVM_ARM_PSCI_0_2; +- +- return KVM_ARM_PSCI_0_1; +-} +- + static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) + { + struct kvm *kvm = vcpu->kvm; +- unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); ++ unsigned long psci_fn = smccc_get_function(vcpu); + unsigned long val; + int ret = 1; + +@@ -219,7 +243,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) + * Bits[31:16] = Major Version = 0 + * Bits[15:0] = Minor Version = 2 + */ +- val = 2; ++ val = KVM_ARM_PSCI_0_2; + break; + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: +@@ -276,14 +300,56 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) + break; + } + +- vcpu_set_reg(vcpu, 0, val); ++ smccc_set_retval(vcpu, val, 0, 0, 0); ++ return ret; ++} ++ ++static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) ++{ ++ u32 psci_fn = smccc_get_function(vcpu); ++ u32 feature; ++ unsigned long val; ++ int ret = 1; ++ ++ switch(psci_fn) { ++ case PSCI_0_2_FN_PSCI_VERSION: ++ val = KVM_ARM_PSCI_1_0; ++ break; ++ case PSCI_1_0_FN_PSCI_FEATURES: ++ feature = smccc_get_arg1(vcpu); ++ switch(feature) { ++ case PSCI_0_2_FN_PSCI_VERSION: ++ case PSCI_0_2_FN_CPU_SUSPEND: ++ case PSCI_0_2_FN64_CPU_SUSPEND: ++ case PSCI_0_2_FN_CPU_OFF: ++ case PSCI_0_2_FN_CPU_ON: ++ case PSCI_0_2_FN64_CPU_ON: ++ case PSCI_0_2_FN_AFFINITY_INFO: ++ case PSCI_0_2_FN64_AFFINITY_INFO: ++ case PSCI_0_2_FN_MIGRATE_INFO_TYPE: ++ case PSCI_0_2_FN_SYSTEM_OFF: ++ case PSCI_0_2_FN_SYSTEM_RESET: ++ case PSCI_1_0_FN_PSCI_FEATURES: ++ case ARM_SMCCC_VERSION_FUNC_ID: ++ val = 0; ++ break; ++ default: ++ val = PSCI_RET_NOT_SUPPORTED; ++ break; ++ } ++ break; ++ default: ++ return kvm_psci_0_2_call(vcpu); ++ } ++ ++ smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; + } + + static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) + { + struct kvm *kvm = vcpu->kvm; +- unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); ++ unsigned long psci_fn = smccc_get_function(vcpu); + unsigned long val; + + switch (psci_fn) { +@@ -301,7 +367,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) + break; + } + +- vcpu_set_reg(vcpu, 0, val); ++ smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; + } + +@@ -319,9 +385,11 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) + * Errors: + * -EINVAL: Unrecognized PSCI function + */ +-int kvm_psci_call(struct kvm_vcpu *vcpu) ++static int kvm_psci_call(struct kvm_vcpu *vcpu) + { +- switch (kvm_psci_version(vcpu)) { ++ switch (kvm_psci_version(vcpu, vcpu->kvm)) { ++ case KVM_ARM_PSCI_1_0: ++ return kvm_psci_1_0_call(vcpu); + case KVM_ARM_PSCI_0_2: + return kvm_psci_0_2_call(vcpu); + case KVM_ARM_PSCI_0_1: +@@ -330,3 +398,30 @@ int kvm_psci_call(struct 
kvm_vcpu *vcpu) + return -EINVAL; + }; + } ++ ++int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) ++{ ++ u32 func_id = smccc_get_function(vcpu); ++ u32 val = PSCI_RET_NOT_SUPPORTED; ++ u32 feature; ++ ++ switch (func_id) { ++ case ARM_SMCCC_VERSION_FUNC_ID: ++ val = ARM_SMCCC_VERSION_1_1; ++ break; ++ case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: ++ feature = smccc_get_arg1(vcpu); ++ switch(feature) { ++ case ARM_SMCCC_ARCH_WORKAROUND_1: ++ if (kvm_arm_harden_branch_predictor()) ++ val = 0; ++ break; ++ } ++ break; ++ default: ++ return kvm_psci_call(vcpu); ++ } ++ ++ smccc_set_retval(vcpu, val, 0, 0, 0); ++ return 1; ++} +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index c8471cf46cbb..90e58bbbd858 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -745,6 +745,23 @@ config UNMAP_KERNEL_AT_EL0 + + If unsure, say Y. + ++config HARDEN_BRANCH_PREDICTOR ++ bool "Harden the branch predictor against aliasing attacks" if EXPERT ++ default y ++ help ++ Speculation attacks against some high-performance processors rely on ++ being able to manipulate the branch predictor for a victim context by ++ executing aliasing branches in the attacker context. Such attacks ++ can be partially mitigated against by clearing internal branch ++ predictor state and limiting the prediction logic in some situations. ++ ++ This config option will take CPU-specific actions to harden the ++ branch predictor against aliasing attacks and may rely on specific ++ instruction sequences or control bits being set by the system ++ firmware. ++ ++ If unsure, say Y. ++ + menuconfig ARMV8_DEPRECATED + bool "Emulate deprecated/obsolete ARMv8 instructions" + depends on COMPAT +diff --git a/arch/arm64/crypto/sha256-core.S b/arch/arm64/crypto/sha256-core.S +new file mode 100644 +index 000000000000..3ce82cc860bc +--- /dev/null ++++ b/arch/arm64/crypto/sha256-core.S +@@ -0,0 +1,2061 @@ ++// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. ++// ++// Licensed under the OpenSSL license (the "License"). You may not use ++// this file except in compliance with the License. You can obtain a copy ++// in the file LICENSE in the source distribution or at ++// https://www.openssl.org/source/license.html ++ ++// ==================================================================== ++// Written by Andy Polyakov for the OpenSSL ++// project. The module is, however, dual licensed under OpenSSL and ++// CRYPTOGAMS licenses depending on where you obtain it. For further ++// details see http://www.openssl.org/~appro/cryptogams/. ++// ++// Permission to use under GPLv2 terms is granted. ++// ==================================================================== ++// ++// SHA256/512 for ARMv8. ++// ++// Performance in cycles per processed byte and improvement coefficient ++// over code generated with "default" compiler: ++// ++// SHA256-hw SHA256(*) SHA512 ++// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) ++// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) ++// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) ++// Denver 2.01 10.5 (+26%) 6.70 (+8%) ++// X-Gene 20.0 (+100%) 12.8 (+300%(***)) ++// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) ++// ++// (*) Software SHA256 results are of lesser relevance, presented ++// mostly for informational purposes. ++// (**) The result is a trade-off: it's possible to improve it by ++// 10% (or by 1 cycle per round), but at the cost of 20% loss ++// on Cortex-A53 (or by 4 cycles per round). 
++// (***) Super-impressive coefficients over gcc-generated code are ++// indication of some compiler "pathology", most notably code ++// generated with -mgeneral-regs-only is significanty faster ++// and the gap is only 40-90%. ++// ++// October 2016. ++// ++// Originally it was reckoned that it makes no sense to implement NEON ++// version of SHA256 for 64-bit processors. This is because performance ++// improvement on most wide-spread Cortex-A5x processors was observed ++// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was ++// observed that 32-bit NEON SHA256 performs significantly better than ++// 64-bit scalar version on *some* of the more recent processors. As ++// result 64-bit NEON version of SHA256 was added to provide best ++// all-round performance. For example it executes ~30% faster on X-Gene ++// and Mongoose. [For reference, NEON version of SHA512 is bound to ++// deliver much less improvement, likely *negative* on Cortex-A5x. ++// Which is why NEON support is limited to SHA256.] ++ ++#ifndef __KERNEL__ ++# include "arm_arch.h" ++#endif ++ ++.text ++ ++.extern OPENSSL_armcap_P ++.globl sha256_block_data_order ++.type sha256_block_data_order,%function ++.align 6 ++sha256_block_data_order: ++#ifndef __KERNEL__ ++# ifdef __ILP32__ ++ ldrsw x16,.LOPENSSL_armcap_P ++# else ++ ldr x16,.LOPENSSL_armcap_P ++# endif ++ adr x17,.LOPENSSL_armcap_P ++ add x16,x16,x17 ++ ldr w16,[x16] ++ tst w16,#ARMV8_SHA256 ++ b.ne .Lv8_entry ++ tst w16,#ARMV7_NEON ++ b.ne .Lneon_entry ++#endif ++ stp x29,x30,[sp,#-128]! ++ add x29,sp,#0 ++ ++ stp x19,x20,[sp,#16] ++ stp x21,x22,[sp,#32] ++ stp x23,x24,[sp,#48] ++ stp x25,x26,[sp,#64] ++ stp x27,x28,[sp,#80] ++ sub sp,sp,#4*4 ++ ++ ldp w20,w21,[x0] // load context ++ ldp w22,w23,[x0,#2*4] ++ ldp w24,w25,[x0,#4*4] ++ add x2,x1,x2,lsl#6 // end of input ++ ldp w26,w27,[x0,#6*4] ++ adr x30,.LK256 ++ stp x0,x2,[x29,#96] ++ ++.Loop: ++ ldp w3,w4,[x1],#2*4 ++ ldr w19,[x30],#4 // *K++ ++ eor w28,w21,w22 // magic seed ++ str x1,[x29,#112] ++#ifndef __AARCH64EB__ ++ rev w3,w3 // 0 ++#endif ++ ror w16,w24,#6 ++ add w27,w27,w19 // h+=K[i] ++ eor w6,w24,w24,ror#14 ++ and w17,w25,w24 ++ bic w19,w26,w24 ++ add w27,w27,w3 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w20,w21 // a^b, b^c in next round ++ eor w16,w16,w6,ror#11 // Sigma1(e) ++ ror w6,w20,#2 ++ add w27,w27,w17 // h+=Ch(e,f,g) ++ eor w17,w20,w20,ror#9 ++ add w27,w27,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w23,w23,w27 // d+=h ++ eor w28,w28,w21 // Maj(a,b,c) ++ eor w17,w6,w17,ror#13 // Sigma0(a) ++ add w27,w27,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w27,w27,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w4,w4 // 1 ++#endif ++ ldp w5,w6,[x1],#2*4 ++ add w27,w27,w17 // h+=Sigma0(a) ++ ror w16,w23,#6 ++ add w26,w26,w28 // h+=K[i] ++ eor w7,w23,w23,ror#14 ++ and w17,w24,w23 ++ bic w28,w25,w23 ++ add w26,w26,w4 // h+=X[i] ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w27,w20 // a^b, b^c in next round ++ eor w16,w16,w7,ror#11 // Sigma1(e) ++ ror w7,w27,#2 ++ add w26,w26,w17 // h+=Ch(e,f,g) ++ eor w17,w27,w27,ror#9 ++ add w26,w26,w16 // h+=Sigma1(e) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ add w22,w22,w26 // d+=h ++ eor w19,w19,w20 // Maj(a,b,c) ++ eor w17,w7,w17,ror#13 // Sigma0(a) ++ add w26,w26,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ //add w26,w26,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w5,w5 // 2 ++#endif ++ add w26,w26,w17 // h+=Sigma0(a) ++ ror w16,w22,#6 ++ add w25,w25,w19 // h+=K[i] 
++ eor w8,w22,w22,ror#14 ++ and w17,w23,w22 ++ bic w19,w24,w22 ++ add w25,w25,w5 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w26,w27 // a^b, b^c in next round ++ eor w16,w16,w8,ror#11 // Sigma1(e) ++ ror w8,w26,#2 ++ add w25,w25,w17 // h+=Ch(e,f,g) ++ eor w17,w26,w26,ror#9 ++ add w25,w25,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w21,w21,w25 // d+=h ++ eor w28,w28,w27 // Maj(a,b,c) ++ eor w17,w8,w17,ror#13 // Sigma0(a) ++ add w25,w25,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w25,w25,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w6,w6 // 3 ++#endif ++ ldp w7,w8,[x1],#2*4 ++ add w25,w25,w17 // h+=Sigma0(a) ++ ror w16,w21,#6 ++ add w24,w24,w28 // h+=K[i] ++ eor w9,w21,w21,ror#14 ++ and w17,w22,w21 ++ bic w28,w23,w21 ++ add w24,w24,w6 // h+=X[i] ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w25,w26 // a^b, b^c in next round ++ eor w16,w16,w9,ror#11 // Sigma1(e) ++ ror w9,w25,#2 ++ add w24,w24,w17 // h+=Ch(e,f,g) ++ eor w17,w25,w25,ror#9 ++ add w24,w24,w16 // h+=Sigma1(e) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ add w20,w20,w24 // d+=h ++ eor w19,w19,w26 // Maj(a,b,c) ++ eor w17,w9,w17,ror#13 // Sigma0(a) ++ add w24,w24,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ //add w24,w24,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w7,w7 // 4 ++#endif ++ add w24,w24,w17 // h+=Sigma0(a) ++ ror w16,w20,#6 ++ add w23,w23,w19 // h+=K[i] ++ eor w10,w20,w20,ror#14 ++ and w17,w21,w20 ++ bic w19,w22,w20 ++ add w23,w23,w7 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w24,w25 // a^b, b^c in next round ++ eor w16,w16,w10,ror#11 // Sigma1(e) ++ ror w10,w24,#2 ++ add w23,w23,w17 // h+=Ch(e,f,g) ++ eor w17,w24,w24,ror#9 ++ add w23,w23,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w27,w27,w23 // d+=h ++ eor w28,w28,w25 // Maj(a,b,c) ++ eor w17,w10,w17,ror#13 // Sigma0(a) ++ add w23,w23,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w23,w23,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w8,w8 // 5 ++#endif ++ ldp w9,w10,[x1],#2*4 ++ add w23,w23,w17 // h+=Sigma0(a) ++ ror w16,w27,#6 ++ add w22,w22,w28 // h+=K[i] ++ eor w11,w27,w27,ror#14 ++ and w17,w20,w27 ++ bic w28,w21,w27 ++ add w22,w22,w8 // h+=X[i] ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w23,w24 // a^b, b^c in next round ++ eor w16,w16,w11,ror#11 // Sigma1(e) ++ ror w11,w23,#2 ++ add w22,w22,w17 // h+=Ch(e,f,g) ++ eor w17,w23,w23,ror#9 ++ add w22,w22,w16 // h+=Sigma1(e) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ add w26,w26,w22 // d+=h ++ eor w19,w19,w24 // Maj(a,b,c) ++ eor w17,w11,w17,ror#13 // Sigma0(a) ++ add w22,w22,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ //add w22,w22,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w9,w9 // 6 ++#endif ++ add w22,w22,w17 // h+=Sigma0(a) ++ ror w16,w26,#6 ++ add w21,w21,w19 // h+=K[i] ++ eor w12,w26,w26,ror#14 ++ and w17,w27,w26 ++ bic w19,w20,w26 ++ add w21,w21,w9 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w22,w23 // a^b, b^c in next round ++ eor w16,w16,w12,ror#11 // Sigma1(e) ++ ror w12,w22,#2 ++ add w21,w21,w17 // h+=Ch(e,f,g) ++ eor w17,w22,w22,ror#9 ++ add w21,w21,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w25,w25,w21 // d+=h ++ eor w28,w28,w23 // Maj(a,b,c) ++ eor w17,w12,w17,ror#13 // Sigma0(a) ++ add w21,w21,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w21,w21,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w10,w10 // 7 ++#endif ++ ldp w11,w12,[x1],#2*4 ++ add w21,w21,w17 // 
h+=Sigma0(a) ++ ror w16,w25,#6 ++ add w20,w20,w28 // h+=K[i] ++ eor w13,w25,w25,ror#14 ++ and w17,w26,w25 ++ bic w28,w27,w25 ++ add w20,w20,w10 // h+=X[i] ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w21,w22 // a^b, b^c in next round ++ eor w16,w16,w13,ror#11 // Sigma1(e) ++ ror w13,w21,#2 ++ add w20,w20,w17 // h+=Ch(e,f,g) ++ eor w17,w21,w21,ror#9 ++ add w20,w20,w16 // h+=Sigma1(e) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ add w24,w24,w20 // d+=h ++ eor w19,w19,w22 // Maj(a,b,c) ++ eor w17,w13,w17,ror#13 // Sigma0(a) ++ add w20,w20,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ //add w20,w20,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w11,w11 // 8 ++#endif ++ add w20,w20,w17 // h+=Sigma0(a) ++ ror w16,w24,#6 ++ add w27,w27,w19 // h+=K[i] ++ eor w14,w24,w24,ror#14 ++ and w17,w25,w24 ++ bic w19,w26,w24 ++ add w27,w27,w11 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w20,w21 // a^b, b^c in next round ++ eor w16,w16,w14,ror#11 // Sigma1(e) ++ ror w14,w20,#2 ++ add w27,w27,w17 // h+=Ch(e,f,g) ++ eor w17,w20,w20,ror#9 ++ add w27,w27,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w23,w23,w27 // d+=h ++ eor w28,w28,w21 // Maj(a,b,c) ++ eor w17,w14,w17,ror#13 // Sigma0(a) ++ add w27,w27,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w27,w27,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w12,w12 // 9 ++#endif ++ ldp w13,w14,[x1],#2*4 ++ add w27,w27,w17 // h+=Sigma0(a) ++ ror w16,w23,#6 ++ add w26,w26,w28 // h+=K[i] ++ eor w15,w23,w23,ror#14 ++ and w17,w24,w23 ++ bic w28,w25,w23 ++ add w26,w26,w12 // h+=X[i] ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w27,w20 // a^b, b^c in next round ++ eor w16,w16,w15,ror#11 // Sigma1(e) ++ ror w15,w27,#2 ++ add w26,w26,w17 // h+=Ch(e,f,g) ++ eor w17,w27,w27,ror#9 ++ add w26,w26,w16 // h+=Sigma1(e) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ add w22,w22,w26 // d+=h ++ eor w19,w19,w20 // Maj(a,b,c) ++ eor w17,w15,w17,ror#13 // Sigma0(a) ++ add w26,w26,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ //add w26,w26,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w13,w13 // 10 ++#endif ++ add w26,w26,w17 // h+=Sigma0(a) ++ ror w16,w22,#6 ++ add w25,w25,w19 // h+=K[i] ++ eor w0,w22,w22,ror#14 ++ and w17,w23,w22 ++ bic w19,w24,w22 ++ add w25,w25,w13 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w26,w27 // a^b, b^c in next round ++ eor w16,w16,w0,ror#11 // Sigma1(e) ++ ror w0,w26,#2 ++ add w25,w25,w17 // h+=Ch(e,f,g) ++ eor w17,w26,w26,ror#9 ++ add w25,w25,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w21,w21,w25 // d+=h ++ eor w28,w28,w27 // Maj(a,b,c) ++ eor w17,w0,w17,ror#13 // Sigma0(a) ++ add w25,w25,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w25,w25,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w14,w14 // 11 ++#endif ++ ldp w15,w0,[x1],#2*4 ++ add w25,w25,w17 // h+=Sigma0(a) ++ str w6,[sp,#12] ++ ror w16,w21,#6 ++ add w24,w24,w28 // h+=K[i] ++ eor w6,w21,w21,ror#14 ++ and w17,w22,w21 ++ bic w28,w23,w21 ++ add w24,w24,w14 // h+=X[i] ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w25,w26 // a^b, b^c in next round ++ eor w16,w16,w6,ror#11 // Sigma1(e) ++ ror w6,w25,#2 ++ add w24,w24,w17 // h+=Ch(e,f,g) ++ eor w17,w25,w25,ror#9 ++ add w24,w24,w16 // h+=Sigma1(e) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ add w20,w20,w24 // d+=h ++ eor w19,w19,w26 // Maj(a,b,c) ++ eor w17,w6,w17,ror#13 // Sigma0(a) ++ add w24,w24,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ //add w24,w24,w17 // h+=Sigma0(a) 
++#ifndef __AARCH64EB__ ++ rev w15,w15 // 12 ++#endif ++ add w24,w24,w17 // h+=Sigma0(a) ++ str w7,[sp,#0] ++ ror w16,w20,#6 ++ add w23,w23,w19 // h+=K[i] ++ eor w7,w20,w20,ror#14 ++ and w17,w21,w20 ++ bic w19,w22,w20 ++ add w23,w23,w15 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w24,w25 // a^b, b^c in next round ++ eor w16,w16,w7,ror#11 // Sigma1(e) ++ ror w7,w24,#2 ++ add w23,w23,w17 // h+=Ch(e,f,g) ++ eor w17,w24,w24,ror#9 ++ add w23,w23,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w27,w27,w23 // d+=h ++ eor w28,w28,w25 // Maj(a,b,c) ++ eor w17,w7,w17,ror#13 // Sigma0(a) ++ add w23,w23,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w23,w23,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w0,w0 // 13 ++#endif ++ ldp w1,w2,[x1] ++ add w23,w23,w17 // h+=Sigma0(a) ++ str w8,[sp,#4] ++ ror w16,w27,#6 ++ add w22,w22,w28 // h+=K[i] ++ eor w8,w27,w27,ror#14 ++ and w17,w20,w27 ++ bic w28,w21,w27 ++ add w22,w22,w0 // h+=X[i] ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w23,w24 // a^b, b^c in next round ++ eor w16,w16,w8,ror#11 // Sigma1(e) ++ ror w8,w23,#2 ++ add w22,w22,w17 // h+=Ch(e,f,g) ++ eor w17,w23,w23,ror#9 ++ add w22,w22,w16 // h+=Sigma1(e) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ add w26,w26,w22 // d+=h ++ eor w19,w19,w24 // Maj(a,b,c) ++ eor w17,w8,w17,ror#13 // Sigma0(a) ++ add w22,w22,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ //add w22,w22,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w1,w1 // 14 ++#endif ++ ldr w6,[sp,#12] ++ add w22,w22,w17 // h+=Sigma0(a) ++ str w9,[sp,#8] ++ ror w16,w26,#6 ++ add w21,w21,w19 // h+=K[i] ++ eor w9,w26,w26,ror#14 ++ and w17,w27,w26 ++ bic w19,w20,w26 ++ add w21,w21,w1 // h+=X[i] ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w22,w23 // a^b, b^c in next round ++ eor w16,w16,w9,ror#11 // Sigma1(e) ++ ror w9,w22,#2 ++ add w21,w21,w17 // h+=Ch(e,f,g) ++ eor w17,w22,w22,ror#9 ++ add w21,w21,w16 // h+=Sigma1(e) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ add w25,w25,w21 // d+=h ++ eor w28,w28,w23 // Maj(a,b,c) ++ eor w17,w9,w17,ror#13 // Sigma0(a) ++ add w21,w21,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ //add w21,w21,w17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev w2,w2 // 15 ++#endif ++ ldr w7,[sp,#0] ++ add w21,w21,w17 // h+=Sigma0(a) ++ str w10,[sp,#12] ++ ror w16,w25,#6 ++ add w20,w20,w28 // h+=K[i] ++ ror w9,w4,#7 ++ and w17,w26,w25 ++ ror w8,w1,#17 ++ bic w28,w27,w25 ++ ror w10,w21,#2 ++ add w20,w20,w2 // h+=X[i] ++ eor w16,w16,w25,ror#11 ++ eor w9,w9,w4,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w21,w22 // a^b, b^c in next round ++ eor w16,w16,w25,ror#25 // Sigma1(e) ++ eor w10,w10,w21,ror#13 ++ add w20,w20,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w8,w8,w1,ror#19 ++ eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) ++ add w20,w20,w16 // h+=Sigma1(e) ++ eor w19,w19,w22 // Maj(a,b,c) ++ eor w17,w10,w21,ror#22 // Sigma0(a) ++ eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) ++ add w3,w3,w12 ++ add w24,w24,w20 // d+=h ++ add w20,w20,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w3,w3,w9 ++ add w20,w20,w17 // h+=Sigma0(a) ++ add w3,w3,w8 ++.Loop_16_xx: ++ ldr w8,[sp,#4] ++ str w11,[sp,#0] ++ ror w16,w24,#6 ++ add w27,w27,w19 // h+=K[i] ++ ror w10,w5,#7 ++ and w17,w25,w24 ++ ror w9,w2,#17 ++ bic w19,w26,w24 ++ ror w11,w20,#2 ++ add w27,w27,w3 // h+=X[i] ++ eor w16,w16,w24,ror#11 ++ eor w10,w10,w5,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w20,w21 // a^b, b^c in next round ++ eor w16,w16,w24,ror#25 // 
Sigma1(e) ++ eor w11,w11,w20,ror#13 ++ add w27,w27,w17 // h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ eor w9,w9,w2,ror#19 ++ eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) ++ add w27,w27,w16 // h+=Sigma1(e) ++ eor w28,w28,w21 // Maj(a,b,c) ++ eor w17,w11,w20,ror#22 // Sigma0(a) ++ eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) ++ add w4,w4,w13 ++ add w23,w23,w27 // d+=h ++ add w27,w27,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w4,w4,w10 ++ add w27,w27,w17 // h+=Sigma0(a) ++ add w4,w4,w9 ++ ldr w9,[sp,#8] ++ str w12,[sp,#4] ++ ror w16,w23,#6 ++ add w26,w26,w28 // h+=K[i] ++ ror w11,w6,#7 ++ and w17,w24,w23 ++ ror w10,w3,#17 ++ bic w28,w25,w23 ++ ror w12,w27,#2 ++ add w26,w26,w4 // h+=X[i] ++ eor w16,w16,w23,ror#11 ++ eor w11,w11,w6,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w27,w20 // a^b, b^c in next round ++ eor w16,w16,w23,ror#25 // Sigma1(e) ++ eor w12,w12,w27,ror#13 ++ add w26,w26,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w10,w10,w3,ror#19 ++ eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) ++ add w26,w26,w16 // h+=Sigma1(e) ++ eor w19,w19,w20 // Maj(a,b,c) ++ eor w17,w12,w27,ror#22 // Sigma0(a) ++ eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) ++ add w5,w5,w14 ++ add w22,w22,w26 // d+=h ++ add w26,w26,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w5,w5,w11 ++ add w26,w26,w17 // h+=Sigma0(a) ++ add w5,w5,w10 ++ ldr w10,[sp,#12] ++ str w13,[sp,#8] ++ ror w16,w22,#6 ++ add w25,w25,w19 // h+=K[i] ++ ror w12,w7,#7 ++ and w17,w23,w22 ++ ror w11,w4,#17 ++ bic w19,w24,w22 ++ ror w13,w26,#2 ++ add w25,w25,w5 // h+=X[i] ++ eor w16,w16,w22,ror#11 ++ eor w12,w12,w7,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w26,w27 // a^b, b^c in next round ++ eor w16,w16,w22,ror#25 // Sigma1(e) ++ eor w13,w13,w26,ror#13 ++ add w25,w25,w17 // h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ eor w11,w11,w4,ror#19 ++ eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) ++ add w25,w25,w16 // h+=Sigma1(e) ++ eor w28,w28,w27 // Maj(a,b,c) ++ eor w17,w13,w26,ror#22 // Sigma0(a) ++ eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) ++ add w6,w6,w15 ++ add w21,w21,w25 // d+=h ++ add w25,w25,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w6,w6,w12 ++ add w25,w25,w17 // h+=Sigma0(a) ++ add w6,w6,w11 ++ ldr w11,[sp,#0] ++ str w14,[sp,#12] ++ ror w16,w21,#6 ++ add w24,w24,w28 // h+=K[i] ++ ror w13,w8,#7 ++ and w17,w22,w21 ++ ror w12,w5,#17 ++ bic w28,w23,w21 ++ ror w14,w25,#2 ++ add w24,w24,w6 // h+=X[i] ++ eor w16,w16,w21,ror#11 ++ eor w13,w13,w8,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w25,w26 // a^b, b^c in next round ++ eor w16,w16,w21,ror#25 // Sigma1(e) ++ eor w14,w14,w25,ror#13 ++ add w24,w24,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w12,w12,w5,ror#19 ++ eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) ++ add w24,w24,w16 // h+=Sigma1(e) ++ eor w19,w19,w26 // Maj(a,b,c) ++ eor w17,w14,w25,ror#22 // Sigma0(a) ++ eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) ++ add w7,w7,w0 ++ add w20,w20,w24 // d+=h ++ add w24,w24,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w7,w7,w13 ++ add w24,w24,w17 // h+=Sigma0(a) ++ add w7,w7,w12 ++ ldr w12,[sp,#4] ++ str w15,[sp,#0] ++ ror w16,w20,#6 ++ add w23,w23,w19 // h+=K[i] ++ ror w14,w9,#7 ++ and w17,w21,w20 ++ ror w13,w6,#17 ++ bic w19,w22,w20 ++ ror w15,w24,#2 ++ add w23,w23,w7 // h+=X[i] ++ eor w16,w16,w20,ror#11 ++ eor w14,w14,w9,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w24,w25 // a^b, b^c in next round ++ eor w16,w16,w20,ror#25 // Sigma1(e) ++ eor 
w15,w15,w24,ror#13 ++ add w23,w23,w17 // h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ eor w13,w13,w6,ror#19 ++ eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) ++ add w23,w23,w16 // h+=Sigma1(e) ++ eor w28,w28,w25 // Maj(a,b,c) ++ eor w17,w15,w24,ror#22 // Sigma0(a) ++ eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) ++ add w8,w8,w1 ++ add w27,w27,w23 // d+=h ++ add w23,w23,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w8,w8,w14 ++ add w23,w23,w17 // h+=Sigma0(a) ++ add w8,w8,w13 ++ ldr w13,[sp,#8] ++ str w0,[sp,#4] ++ ror w16,w27,#6 ++ add w22,w22,w28 // h+=K[i] ++ ror w15,w10,#7 ++ and w17,w20,w27 ++ ror w14,w7,#17 ++ bic w28,w21,w27 ++ ror w0,w23,#2 ++ add w22,w22,w8 // h+=X[i] ++ eor w16,w16,w27,ror#11 ++ eor w15,w15,w10,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w23,w24 // a^b, b^c in next round ++ eor w16,w16,w27,ror#25 // Sigma1(e) ++ eor w0,w0,w23,ror#13 ++ add w22,w22,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w14,w14,w7,ror#19 ++ eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) ++ add w22,w22,w16 // h+=Sigma1(e) ++ eor w19,w19,w24 // Maj(a,b,c) ++ eor w17,w0,w23,ror#22 // Sigma0(a) ++ eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) ++ add w9,w9,w2 ++ add w26,w26,w22 // d+=h ++ add w22,w22,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w9,w9,w15 ++ add w22,w22,w17 // h+=Sigma0(a) ++ add w9,w9,w14 ++ ldr w14,[sp,#12] ++ str w1,[sp,#8] ++ ror w16,w26,#6 ++ add w21,w21,w19 // h+=K[i] ++ ror w0,w11,#7 ++ and w17,w27,w26 ++ ror w15,w8,#17 ++ bic w19,w20,w26 ++ ror w1,w22,#2 ++ add w21,w21,w9 // h+=X[i] ++ eor w16,w16,w26,ror#11 ++ eor w0,w0,w11,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w22,w23 // a^b, b^c in next round ++ eor w16,w16,w26,ror#25 // Sigma1(e) ++ eor w1,w1,w22,ror#13 ++ add w21,w21,w17 // h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ eor w15,w15,w8,ror#19 ++ eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) ++ add w21,w21,w16 // h+=Sigma1(e) ++ eor w28,w28,w23 // Maj(a,b,c) ++ eor w17,w1,w22,ror#22 // Sigma0(a) ++ eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) ++ add w10,w10,w3 ++ add w25,w25,w21 // d+=h ++ add w21,w21,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w10,w10,w0 ++ add w21,w21,w17 // h+=Sigma0(a) ++ add w10,w10,w15 ++ ldr w15,[sp,#0] ++ str w2,[sp,#12] ++ ror w16,w25,#6 ++ add w20,w20,w28 // h+=K[i] ++ ror w1,w12,#7 ++ and w17,w26,w25 ++ ror w0,w9,#17 ++ bic w28,w27,w25 ++ ror w2,w21,#2 ++ add w20,w20,w10 // h+=X[i] ++ eor w16,w16,w25,ror#11 ++ eor w1,w1,w12,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w21,w22 // a^b, b^c in next round ++ eor w16,w16,w25,ror#25 // Sigma1(e) ++ eor w2,w2,w21,ror#13 ++ add w20,w20,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w0,w0,w9,ror#19 ++ eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) ++ add w20,w20,w16 // h+=Sigma1(e) ++ eor w19,w19,w22 // Maj(a,b,c) ++ eor w17,w2,w21,ror#22 // Sigma0(a) ++ eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) ++ add w11,w11,w4 ++ add w24,w24,w20 // d+=h ++ add w20,w20,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w11,w11,w1 ++ add w20,w20,w17 // h+=Sigma0(a) ++ add w11,w11,w0 ++ ldr w0,[sp,#4] ++ str w3,[sp,#0] ++ ror w16,w24,#6 ++ add w27,w27,w19 // h+=K[i] ++ ror w2,w13,#7 ++ and w17,w25,w24 ++ ror w1,w10,#17 ++ bic w19,w26,w24 ++ ror w3,w20,#2 ++ add w27,w27,w11 // h+=X[i] ++ eor w16,w16,w24,ror#11 ++ eor w2,w2,w13,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w20,w21 // a^b, b^c in next round ++ eor w16,w16,w24,ror#25 // Sigma1(e) ++ eor w3,w3,w20,ror#13 ++ add w27,w27,w17 // 
h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ eor w1,w1,w10,ror#19 ++ eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) ++ add w27,w27,w16 // h+=Sigma1(e) ++ eor w28,w28,w21 // Maj(a,b,c) ++ eor w17,w3,w20,ror#22 // Sigma0(a) ++ eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) ++ add w12,w12,w5 ++ add w23,w23,w27 // d+=h ++ add w27,w27,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w12,w12,w2 ++ add w27,w27,w17 // h+=Sigma0(a) ++ add w12,w12,w1 ++ ldr w1,[sp,#8] ++ str w4,[sp,#4] ++ ror w16,w23,#6 ++ add w26,w26,w28 // h+=K[i] ++ ror w3,w14,#7 ++ and w17,w24,w23 ++ ror w2,w11,#17 ++ bic w28,w25,w23 ++ ror w4,w27,#2 ++ add w26,w26,w12 // h+=X[i] ++ eor w16,w16,w23,ror#11 ++ eor w3,w3,w14,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w27,w20 // a^b, b^c in next round ++ eor w16,w16,w23,ror#25 // Sigma1(e) ++ eor w4,w4,w27,ror#13 ++ add w26,w26,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w2,w2,w11,ror#19 ++ eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) ++ add w26,w26,w16 // h+=Sigma1(e) ++ eor w19,w19,w20 // Maj(a,b,c) ++ eor w17,w4,w27,ror#22 // Sigma0(a) ++ eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) ++ add w13,w13,w6 ++ add w22,w22,w26 // d+=h ++ add w26,w26,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w13,w13,w3 ++ add w26,w26,w17 // h+=Sigma0(a) ++ add w13,w13,w2 ++ ldr w2,[sp,#12] ++ str w5,[sp,#8] ++ ror w16,w22,#6 ++ add w25,w25,w19 // h+=K[i] ++ ror w4,w15,#7 ++ and w17,w23,w22 ++ ror w3,w12,#17 ++ bic w19,w24,w22 ++ ror w5,w26,#2 ++ add w25,w25,w13 // h+=X[i] ++ eor w16,w16,w22,ror#11 ++ eor w4,w4,w15,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w26,w27 // a^b, b^c in next round ++ eor w16,w16,w22,ror#25 // Sigma1(e) ++ eor w5,w5,w26,ror#13 ++ add w25,w25,w17 // h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ eor w3,w3,w12,ror#19 ++ eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) ++ add w25,w25,w16 // h+=Sigma1(e) ++ eor w28,w28,w27 // Maj(a,b,c) ++ eor w17,w5,w26,ror#22 // Sigma0(a) ++ eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) ++ add w14,w14,w7 ++ add w21,w21,w25 // d+=h ++ add w25,w25,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w14,w14,w4 ++ add w25,w25,w17 // h+=Sigma0(a) ++ add w14,w14,w3 ++ ldr w3,[sp,#0] ++ str w6,[sp,#12] ++ ror w16,w21,#6 ++ add w24,w24,w28 // h+=K[i] ++ ror w5,w0,#7 ++ and w17,w22,w21 ++ ror w4,w13,#17 ++ bic w28,w23,w21 ++ ror w6,w25,#2 ++ add w24,w24,w14 // h+=X[i] ++ eor w16,w16,w21,ror#11 ++ eor w5,w5,w0,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w25,w26 // a^b, b^c in next round ++ eor w16,w16,w21,ror#25 // Sigma1(e) ++ eor w6,w6,w25,ror#13 ++ add w24,w24,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w4,w4,w13,ror#19 ++ eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) ++ add w24,w24,w16 // h+=Sigma1(e) ++ eor w19,w19,w26 // Maj(a,b,c) ++ eor w17,w6,w25,ror#22 // Sigma0(a) ++ eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) ++ add w15,w15,w8 ++ add w20,w20,w24 // d+=h ++ add w24,w24,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w15,w15,w5 ++ add w24,w24,w17 // h+=Sigma0(a) ++ add w15,w15,w4 ++ ldr w4,[sp,#4] ++ str w7,[sp,#0] ++ ror w16,w20,#6 ++ add w23,w23,w19 // h+=K[i] ++ ror w6,w1,#7 ++ and w17,w21,w20 ++ ror w5,w14,#17 ++ bic w19,w22,w20 ++ ror w7,w24,#2 ++ add w23,w23,w15 // h+=X[i] ++ eor w16,w16,w20,ror#11 ++ eor w6,w6,w1,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w24,w25 // a^b, b^c in next round ++ eor w16,w16,w20,ror#25 // Sigma1(e) ++ eor w7,w7,w24,ror#13 ++ add w23,w23,w17 // h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) 
++ eor w5,w5,w14,ror#19 ++ eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) ++ add w23,w23,w16 // h+=Sigma1(e) ++ eor w28,w28,w25 // Maj(a,b,c) ++ eor w17,w7,w24,ror#22 // Sigma0(a) ++ eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) ++ add w0,w0,w9 ++ add w27,w27,w23 // d+=h ++ add w23,w23,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w0,w0,w6 ++ add w23,w23,w17 // h+=Sigma0(a) ++ add w0,w0,w5 ++ ldr w5,[sp,#8] ++ str w8,[sp,#4] ++ ror w16,w27,#6 ++ add w22,w22,w28 // h+=K[i] ++ ror w7,w2,#7 ++ and w17,w20,w27 ++ ror w6,w15,#17 ++ bic w28,w21,w27 ++ ror w8,w23,#2 ++ add w22,w22,w0 // h+=X[i] ++ eor w16,w16,w27,ror#11 ++ eor w7,w7,w2,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w23,w24 // a^b, b^c in next round ++ eor w16,w16,w27,ror#25 // Sigma1(e) ++ eor w8,w8,w23,ror#13 ++ add w22,w22,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w6,w6,w15,ror#19 ++ eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) ++ add w22,w22,w16 // h+=Sigma1(e) ++ eor w19,w19,w24 // Maj(a,b,c) ++ eor w17,w8,w23,ror#22 // Sigma0(a) ++ eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) ++ add w1,w1,w10 ++ add w26,w26,w22 // d+=h ++ add w22,w22,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w1,w1,w7 ++ add w22,w22,w17 // h+=Sigma0(a) ++ add w1,w1,w6 ++ ldr w6,[sp,#12] ++ str w9,[sp,#8] ++ ror w16,w26,#6 ++ add w21,w21,w19 // h+=K[i] ++ ror w8,w3,#7 ++ and w17,w27,w26 ++ ror w7,w0,#17 ++ bic w19,w20,w26 ++ ror w9,w22,#2 ++ add w21,w21,w1 // h+=X[i] ++ eor w16,w16,w26,ror#11 ++ eor w8,w8,w3,ror#18 ++ orr w17,w17,w19 // Ch(e,f,g) ++ eor w19,w22,w23 // a^b, b^c in next round ++ eor w16,w16,w26,ror#25 // Sigma1(e) ++ eor w9,w9,w22,ror#13 ++ add w21,w21,w17 // h+=Ch(e,f,g) ++ and w28,w28,w19 // (b^c)&=(a^b) ++ eor w7,w7,w0,ror#19 ++ eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) ++ add w21,w21,w16 // h+=Sigma1(e) ++ eor w28,w28,w23 // Maj(a,b,c) ++ eor w17,w9,w22,ror#22 // Sigma0(a) ++ eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) ++ add w2,w2,w11 ++ add w25,w25,w21 // d+=h ++ add w21,w21,w28 // h+=Maj(a,b,c) ++ ldr w28,[x30],#4 // *K++, w19 in next round ++ add w2,w2,w8 ++ add w21,w21,w17 // h+=Sigma0(a) ++ add w2,w2,w7 ++ ldr w7,[sp,#0] ++ str w10,[sp,#12] ++ ror w16,w25,#6 ++ add w20,w20,w28 // h+=K[i] ++ ror w9,w4,#7 ++ and w17,w26,w25 ++ ror w8,w1,#17 ++ bic w28,w27,w25 ++ ror w10,w21,#2 ++ add w20,w20,w2 // h+=X[i] ++ eor w16,w16,w25,ror#11 ++ eor w9,w9,w4,ror#18 ++ orr w17,w17,w28 // Ch(e,f,g) ++ eor w28,w21,w22 // a^b, b^c in next round ++ eor w16,w16,w25,ror#25 // Sigma1(e) ++ eor w10,w10,w21,ror#13 ++ add w20,w20,w17 // h+=Ch(e,f,g) ++ and w19,w19,w28 // (b^c)&=(a^b) ++ eor w8,w8,w1,ror#19 ++ eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) ++ add w20,w20,w16 // h+=Sigma1(e) ++ eor w19,w19,w22 // Maj(a,b,c) ++ eor w17,w10,w21,ror#22 // Sigma0(a) ++ eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) ++ add w3,w3,w12 ++ add w24,w24,w20 // d+=h ++ add w20,w20,w19 // h+=Maj(a,b,c) ++ ldr w19,[x30],#4 // *K++, w28 in next round ++ add w3,w3,w9 ++ add w20,w20,w17 // h+=Sigma0(a) ++ add w3,w3,w8 ++ cbnz w19,.Loop_16_xx ++ ++ ldp x0,x2,[x29,#96] ++ ldr x1,[x29,#112] ++ sub x30,x30,#260 // rewind ++ ++ ldp w3,w4,[x0] ++ ldp w5,w6,[x0,#2*4] ++ add x1,x1,#14*4 // advance input pointer ++ ldp w7,w8,[x0,#4*4] ++ add w20,w20,w3 ++ ldp w9,w10,[x0,#6*4] ++ add w21,w21,w4 ++ add w22,w22,w5 ++ add w23,w23,w6 ++ stp w20,w21,[x0] ++ add w24,w24,w7 ++ add w25,w25,w8 ++ stp w22,w23,[x0,#2*4] ++ add w26,w26,w9 ++ add w27,w27,w10 ++ cmp x1,x2 ++ stp w24,w25,[x0,#4*4] ++ stp w26,w27,[x0,#6*4] ++ b.ne .Loop ++ ++ ldp x19,x20,[x29,#16] ++ add 
sp,sp,#4*4 ++ ldp x21,x22,[x29,#32] ++ ldp x23,x24,[x29,#48] ++ ldp x25,x26,[x29,#64] ++ ldp x27,x28,[x29,#80] ++ ldp x29,x30,[sp],#128 ++ ret ++.size sha256_block_data_order,.-sha256_block_data_order ++ ++.align 6 ++.type .LK256,%object ++.LK256: ++ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 ++ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 ++ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 ++ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 ++ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc ++ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da ++ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 ++ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 ++ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 ++ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 ++ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 ++ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 ++ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 ++ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 ++ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 ++ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ++ .long 0 //terminator ++.size .LK256,.-.LK256 ++#ifndef __KERNEL__ ++.align 3 ++.LOPENSSL_armcap_P: ++# ifdef __ILP32__ ++ .long OPENSSL_armcap_P-. ++# else ++ .quad OPENSSL_armcap_P-. ++# endif ++#endif ++.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by " ++.align 2 ++#ifndef __KERNEL__ ++.type sha256_block_armv8,%function ++.align 6 ++sha256_block_armv8: ++.Lv8_entry: ++ stp x29,x30,[sp,#-16]! ++ add x29,sp,#0 ++ ++ ld1 {v0.4s,v1.4s},[x0] ++ adr x3,.LK256 ++ ++.Loop_hw: ++ ld1 {v4.16b-v7.16b},[x1],#64 ++ sub x2,x2,#1 ++ ld1 {v16.4s},[x3],#16 ++ rev32 v4.16b,v4.16b ++ rev32 v5.16b,v5.16b ++ rev32 v6.16b,v6.16b ++ rev32 v7.16b,v7.16b ++ orr v18.16b,v0.16b,v0.16b // offload ++ orr v19.16b,v1.16b,v1.16b ++ ld1 {v17.4s},[x3],#16 ++ add v16.4s,v16.4s,v4.4s ++ .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ++ ld1 {v16.4s},[x3],#16 ++ add v17.4s,v17.4s,v5.4s ++ .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ++ ld1 {v17.4s},[x3],#16 ++ add v16.4s,v16.4s,v6.4s ++ .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ++ ld1 {v16.4s},[x3],#16 ++ add v17.4s,v17.4s,v7.4s ++ .inst 0x5e282887 //sha256su0 v7.16b,v4.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ++ ld1 {v17.4s},[x3],#16 ++ add v16.4s,v16.4s,v4.4s ++ .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ++ ld1 {v16.4s},[x3],#16 ++ add v17.4s,v17.4s,v5.4s ++ .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ++ ld1 {v17.4s},[x3],#16 ++ add 
v16.4s,v16.4s,v6.4s ++ .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ++ ld1 {v16.4s},[x3],#16 ++ add v17.4s,v17.4s,v7.4s ++ .inst 0x5e282887 //sha256su0 v7.16b,v4.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ++ ld1 {v17.4s},[x3],#16 ++ add v16.4s,v16.4s,v4.4s ++ .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b ++ ld1 {v16.4s},[x3],#16 ++ add v17.4s,v17.4s,v5.4s ++ .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b ++ ld1 {v17.4s},[x3],#16 ++ add v16.4s,v16.4s,v6.4s ++ .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b ++ ld1 {v16.4s},[x3],#16 ++ add v17.4s,v17.4s,v7.4s ++ .inst 0x5e282887 //sha256su0 v7.16b,v4.16b ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b ++ ld1 {v17.4s},[x3],#16 ++ add v16.4s,v16.4s,v4.4s ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ ++ ld1 {v16.4s},[x3],#16 ++ add v17.4s,v17.4s,v5.4s ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ ++ ld1 {v17.4s},[x3] ++ add v16.4s,v16.4s,v6.4s ++ sub x3,x3,#64*4-16 // rewind ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s ++ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s ++ ++ add v17.4s,v17.4s,v7.4s ++ orr v2.16b,v0.16b,v0.16b ++ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s ++ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s ++ ++ add v0.4s,v0.4s,v18.4s ++ add v1.4s,v1.4s,v19.4s ++ ++ cbnz x2,.Loop_hw ++ ++ st1 {v0.4s,v1.4s},[x0] ++ ++ ldr x29,[sp],#16 ++ ret ++.size sha256_block_armv8,.-sha256_block_armv8 ++#endif ++#ifdef __KERNEL__ ++.globl sha256_block_neon ++#endif ++.type sha256_block_neon,%function ++.align 4 ++sha256_block_neon: ++.Lneon_entry: ++ stp x29, x30, [sp, #-16]! 
++ mov x29, sp ++ sub sp,sp,#16*4 ++ ++ adr x16,.LK256 ++ add x2,x1,x2,lsl#6 // len to point at the end of inp ++ ++ ld1 {v0.16b},[x1], #16 ++ ld1 {v1.16b},[x1], #16 ++ ld1 {v2.16b},[x1], #16 ++ ld1 {v3.16b},[x1], #16 ++ ld1 {v4.4s},[x16], #16 ++ ld1 {v5.4s},[x16], #16 ++ ld1 {v6.4s},[x16], #16 ++ ld1 {v7.4s},[x16], #16 ++ rev32 v0.16b,v0.16b // yes, even on ++ rev32 v1.16b,v1.16b // big-endian ++ rev32 v2.16b,v2.16b ++ rev32 v3.16b,v3.16b ++ mov x17,sp ++ add v4.4s,v4.4s,v0.4s ++ add v5.4s,v5.4s,v1.4s ++ add v6.4s,v6.4s,v2.4s ++ st1 {v4.4s-v5.4s},[x17], #32 ++ add v7.4s,v7.4s,v3.4s ++ st1 {v6.4s-v7.4s},[x17] ++ sub x17,x17,#32 ++ ++ ldp w3,w4,[x0] ++ ldp w5,w6,[x0,#8] ++ ldp w7,w8,[x0,#16] ++ ldp w9,w10,[x0,#24] ++ ldr w12,[sp,#0] ++ mov w13,wzr ++ eor w14,w4,w5 ++ mov w15,wzr ++ b .L_00_48 ++ ++.align 4 ++.L_00_48: ++ ext v4.16b,v0.16b,v1.16b,#4 ++ add w10,w10,w12 ++ add w3,w3,w15 ++ and w12,w8,w7 ++ bic w15,w9,w7 ++ ext v7.16b,v2.16b,v3.16b,#4 ++ eor w11,w7,w7,ror#5 ++ add w3,w3,w13 ++ mov d19,v3.d[1] ++ orr w12,w12,w15 ++ eor w11,w11,w7,ror#19 ++ ushr v6.4s,v4.4s,#7 ++ eor w15,w3,w3,ror#11 ++ ushr v5.4s,v4.4s,#3 ++ add w10,w10,w12 ++ add v0.4s,v0.4s,v7.4s ++ ror w11,w11,#6 ++ sli v6.4s,v4.4s,#25 ++ eor w13,w3,w4 ++ eor w15,w15,w3,ror#20 ++ ushr v7.4s,v4.4s,#18 ++ add w10,w10,w11 ++ ldr w12,[sp,#4] ++ and w14,w14,w13 ++ eor v5.16b,v5.16b,v6.16b ++ ror w15,w15,#2 ++ add w6,w6,w10 ++ sli v7.4s,v4.4s,#14 ++ eor w14,w14,w4 ++ ushr v16.4s,v19.4s,#17 ++ add w9,w9,w12 ++ add w10,w10,w15 ++ and w12,w7,w6 ++ eor v5.16b,v5.16b,v7.16b ++ bic w15,w8,w6 ++ eor w11,w6,w6,ror#5 ++ sli v16.4s,v19.4s,#15 ++ add w10,w10,w14 ++ orr w12,w12,w15 ++ ushr v17.4s,v19.4s,#10 ++ eor w11,w11,w6,ror#19 ++ eor w15,w10,w10,ror#11 ++ ushr v7.4s,v19.4s,#19 ++ add w9,w9,w12 ++ ror w11,w11,#6 ++ add v0.4s,v0.4s,v5.4s ++ eor w14,w10,w3 ++ eor w15,w15,w10,ror#20 ++ sli v7.4s,v19.4s,#13 ++ add w9,w9,w11 ++ ldr w12,[sp,#8] ++ and w13,w13,w14 ++ eor v17.16b,v17.16b,v16.16b ++ ror w15,w15,#2 ++ add w5,w5,w9 ++ eor w13,w13,w3 ++ eor v17.16b,v17.16b,v7.16b ++ add w8,w8,w12 ++ add w9,w9,w15 ++ and w12,w6,w5 ++ add v0.4s,v0.4s,v17.4s ++ bic w15,w7,w5 ++ eor w11,w5,w5,ror#5 ++ add w9,w9,w13 ++ ushr v18.4s,v0.4s,#17 ++ orr w12,w12,w15 ++ ushr v19.4s,v0.4s,#10 ++ eor w11,w11,w5,ror#19 ++ eor w15,w9,w9,ror#11 ++ sli v18.4s,v0.4s,#15 ++ add w8,w8,w12 ++ ushr v17.4s,v0.4s,#19 ++ ror w11,w11,#6 ++ eor w13,w9,w10 ++ eor v19.16b,v19.16b,v18.16b ++ eor w15,w15,w9,ror#20 ++ add w8,w8,w11 ++ sli v17.4s,v0.4s,#13 ++ ldr w12,[sp,#12] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ ld1 {v4.4s},[x16], #16 ++ add w4,w4,w8 ++ eor v19.16b,v19.16b,v17.16b ++ eor w14,w14,w10 ++ eor v17.16b,v17.16b,v17.16b ++ add w7,w7,w12 ++ add w8,w8,w15 ++ and w12,w5,w4 ++ mov v17.d[1],v19.d[0] ++ bic w15,w6,w4 ++ eor w11,w4,w4,ror#5 ++ add w8,w8,w14 ++ add v0.4s,v0.4s,v17.4s ++ orr w12,w12,w15 ++ eor w11,w11,w4,ror#19 ++ eor w15,w8,w8,ror#11 ++ add v4.4s,v4.4s,v0.4s ++ add w7,w7,w12 ++ ror w11,w11,#6 ++ eor w14,w8,w9 ++ eor w15,w15,w8,ror#20 ++ add w7,w7,w11 ++ ldr w12,[sp,#16] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w3,w3,w7 ++ eor w13,w13,w9 ++ st1 {v4.4s},[x17], #16 ++ ext v4.16b,v1.16b,v2.16b,#4 ++ add w6,w6,w12 ++ add w7,w7,w15 ++ and w12,w4,w3 ++ bic w15,w5,w3 ++ ext v7.16b,v3.16b,v0.16b,#4 ++ eor w11,w3,w3,ror#5 ++ add w7,w7,w13 ++ mov d19,v0.d[1] ++ orr w12,w12,w15 ++ eor w11,w11,w3,ror#19 ++ ushr v6.4s,v4.4s,#7 ++ eor w15,w7,w7,ror#11 ++ ushr v5.4s,v4.4s,#3 ++ add w6,w6,w12 ++ add v1.4s,v1.4s,v7.4s ++ ror w11,w11,#6 ++ sli v6.4s,v4.4s,#25 ++ eor w13,w7,w8 ++ 
eor w15,w15,w7,ror#20 ++ ushr v7.4s,v4.4s,#18 ++ add w6,w6,w11 ++ ldr w12,[sp,#20] ++ and w14,w14,w13 ++ eor v5.16b,v5.16b,v6.16b ++ ror w15,w15,#2 ++ add w10,w10,w6 ++ sli v7.4s,v4.4s,#14 ++ eor w14,w14,w8 ++ ushr v16.4s,v19.4s,#17 ++ add w5,w5,w12 ++ add w6,w6,w15 ++ and w12,w3,w10 ++ eor v5.16b,v5.16b,v7.16b ++ bic w15,w4,w10 ++ eor w11,w10,w10,ror#5 ++ sli v16.4s,v19.4s,#15 ++ add w6,w6,w14 ++ orr w12,w12,w15 ++ ushr v17.4s,v19.4s,#10 ++ eor w11,w11,w10,ror#19 ++ eor w15,w6,w6,ror#11 ++ ushr v7.4s,v19.4s,#19 ++ add w5,w5,w12 ++ ror w11,w11,#6 ++ add v1.4s,v1.4s,v5.4s ++ eor w14,w6,w7 ++ eor w15,w15,w6,ror#20 ++ sli v7.4s,v19.4s,#13 ++ add w5,w5,w11 ++ ldr w12,[sp,#24] ++ and w13,w13,w14 ++ eor v17.16b,v17.16b,v16.16b ++ ror w15,w15,#2 ++ add w9,w9,w5 ++ eor w13,w13,w7 ++ eor v17.16b,v17.16b,v7.16b ++ add w4,w4,w12 ++ add w5,w5,w15 ++ and w12,w10,w9 ++ add v1.4s,v1.4s,v17.4s ++ bic w15,w3,w9 ++ eor w11,w9,w9,ror#5 ++ add w5,w5,w13 ++ ushr v18.4s,v1.4s,#17 ++ orr w12,w12,w15 ++ ushr v19.4s,v1.4s,#10 ++ eor w11,w11,w9,ror#19 ++ eor w15,w5,w5,ror#11 ++ sli v18.4s,v1.4s,#15 ++ add w4,w4,w12 ++ ushr v17.4s,v1.4s,#19 ++ ror w11,w11,#6 ++ eor w13,w5,w6 ++ eor v19.16b,v19.16b,v18.16b ++ eor w15,w15,w5,ror#20 ++ add w4,w4,w11 ++ sli v17.4s,v1.4s,#13 ++ ldr w12,[sp,#28] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ ld1 {v4.4s},[x16], #16 ++ add w8,w8,w4 ++ eor v19.16b,v19.16b,v17.16b ++ eor w14,w14,w6 ++ eor v17.16b,v17.16b,v17.16b ++ add w3,w3,w12 ++ add w4,w4,w15 ++ and w12,w9,w8 ++ mov v17.d[1],v19.d[0] ++ bic w15,w10,w8 ++ eor w11,w8,w8,ror#5 ++ add w4,w4,w14 ++ add v1.4s,v1.4s,v17.4s ++ orr w12,w12,w15 ++ eor w11,w11,w8,ror#19 ++ eor w15,w4,w4,ror#11 ++ add v4.4s,v4.4s,v1.4s ++ add w3,w3,w12 ++ ror w11,w11,#6 ++ eor w14,w4,w5 ++ eor w15,w15,w4,ror#20 ++ add w3,w3,w11 ++ ldr w12,[sp,#32] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w7,w7,w3 ++ eor w13,w13,w5 ++ st1 {v4.4s},[x17], #16 ++ ext v4.16b,v2.16b,v3.16b,#4 ++ add w10,w10,w12 ++ add w3,w3,w15 ++ and w12,w8,w7 ++ bic w15,w9,w7 ++ ext v7.16b,v0.16b,v1.16b,#4 ++ eor w11,w7,w7,ror#5 ++ add w3,w3,w13 ++ mov d19,v1.d[1] ++ orr w12,w12,w15 ++ eor w11,w11,w7,ror#19 ++ ushr v6.4s,v4.4s,#7 ++ eor w15,w3,w3,ror#11 ++ ushr v5.4s,v4.4s,#3 ++ add w10,w10,w12 ++ add v2.4s,v2.4s,v7.4s ++ ror w11,w11,#6 ++ sli v6.4s,v4.4s,#25 ++ eor w13,w3,w4 ++ eor w15,w15,w3,ror#20 ++ ushr v7.4s,v4.4s,#18 ++ add w10,w10,w11 ++ ldr w12,[sp,#36] ++ and w14,w14,w13 ++ eor v5.16b,v5.16b,v6.16b ++ ror w15,w15,#2 ++ add w6,w6,w10 ++ sli v7.4s,v4.4s,#14 ++ eor w14,w14,w4 ++ ushr v16.4s,v19.4s,#17 ++ add w9,w9,w12 ++ add w10,w10,w15 ++ and w12,w7,w6 ++ eor v5.16b,v5.16b,v7.16b ++ bic w15,w8,w6 ++ eor w11,w6,w6,ror#5 ++ sli v16.4s,v19.4s,#15 ++ add w10,w10,w14 ++ orr w12,w12,w15 ++ ushr v17.4s,v19.4s,#10 ++ eor w11,w11,w6,ror#19 ++ eor w15,w10,w10,ror#11 ++ ushr v7.4s,v19.4s,#19 ++ add w9,w9,w12 ++ ror w11,w11,#6 ++ add v2.4s,v2.4s,v5.4s ++ eor w14,w10,w3 ++ eor w15,w15,w10,ror#20 ++ sli v7.4s,v19.4s,#13 ++ add w9,w9,w11 ++ ldr w12,[sp,#40] ++ and w13,w13,w14 ++ eor v17.16b,v17.16b,v16.16b ++ ror w15,w15,#2 ++ add w5,w5,w9 ++ eor w13,w13,w3 ++ eor v17.16b,v17.16b,v7.16b ++ add w8,w8,w12 ++ add w9,w9,w15 ++ and w12,w6,w5 ++ add v2.4s,v2.4s,v17.4s ++ bic w15,w7,w5 ++ eor w11,w5,w5,ror#5 ++ add w9,w9,w13 ++ ushr v18.4s,v2.4s,#17 ++ orr w12,w12,w15 ++ ushr v19.4s,v2.4s,#10 ++ eor w11,w11,w5,ror#19 ++ eor w15,w9,w9,ror#11 ++ sli v18.4s,v2.4s,#15 ++ add w8,w8,w12 ++ ushr v17.4s,v2.4s,#19 ++ ror w11,w11,#6 ++ eor w13,w9,w10 ++ eor v19.16b,v19.16b,v18.16b ++ eor w15,w15,w9,ror#20 ++ add 
w8,w8,w11 ++ sli v17.4s,v2.4s,#13 ++ ldr w12,[sp,#44] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ ld1 {v4.4s},[x16], #16 ++ add w4,w4,w8 ++ eor v19.16b,v19.16b,v17.16b ++ eor w14,w14,w10 ++ eor v17.16b,v17.16b,v17.16b ++ add w7,w7,w12 ++ add w8,w8,w15 ++ and w12,w5,w4 ++ mov v17.d[1],v19.d[0] ++ bic w15,w6,w4 ++ eor w11,w4,w4,ror#5 ++ add w8,w8,w14 ++ add v2.4s,v2.4s,v17.4s ++ orr w12,w12,w15 ++ eor w11,w11,w4,ror#19 ++ eor w15,w8,w8,ror#11 ++ add v4.4s,v4.4s,v2.4s ++ add w7,w7,w12 ++ ror w11,w11,#6 ++ eor w14,w8,w9 ++ eor w15,w15,w8,ror#20 ++ add w7,w7,w11 ++ ldr w12,[sp,#48] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w3,w3,w7 ++ eor w13,w13,w9 ++ st1 {v4.4s},[x17], #16 ++ ext v4.16b,v3.16b,v0.16b,#4 ++ add w6,w6,w12 ++ add w7,w7,w15 ++ and w12,w4,w3 ++ bic w15,w5,w3 ++ ext v7.16b,v1.16b,v2.16b,#4 ++ eor w11,w3,w3,ror#5 ++ add w7,w7,w13 ++ mov d19,v2.d[1] ++ orr w12,w12,w15 ++ eor w11,w11,w3,ror#19 ++ ushr v6.4s,v4.4s,#7 ++ eor w15,w7,w7,ror#11 ++ ushr v5.4s,v4.4s,#3 ++ add w6,w6,w12 ++ add v3.4s,v3.4s,v7.4s ++ ror w11,w11,#6 ++ sli v6.4s,v4.4s,#25 ++ eor w13,w7,w8 ++ eor w15,w15,w7,ror#20 ++ ushr v7.4s,v4.4s,#18 ++ add w6,w6,w11 ++ ldr w12,[sp,#52] ++ and w14,w14,w13 ++ eor v5.16b,v5.16b,v6.16b ++ ror w15,w15,#2 ++ add w10,w10,w6 ++ sli v7.4s,v4.4s,#14 ++ eor w14,w14,w8 ++ ushr v16.4s,v19.4s,#17 ++ add w5,w5,w12 ++ add w6,w6,w15 ++ and w12,w3,w10 ++ eor v5.16b,v5.16b,v7.16b ++ bic w15,w4,w10 ++ eor w11,w10,w10,ror#5 ++ sli v16.4s,v19.4s,#15 ++ add w6,w6,w14 ++ orr w12,w12,w15 ++ ushr v17.4s,v19.4s,#10 ++ eor w11,w11,w10,ror#19 ++ eor w15,w6,w6,ror#11 ++ ushr v7.4s,v19.4s,#19 ++ add w5,w5,w12 ++ ror w11,w11,#6 ++ add v3.4s,v3.4s,v5.4s ++ eor w14,w6,w7 ++ eor w15,w15,w6,ror#20 ++ sli v7.4s,v19.4s,#13 ++ add w5,w5,w11 ++ ldr w12,[sp,#56] ++ and w13,w13,w14 ++ eor v17.16b,v17.16b,v16.16b ++ ror w15,w15,#2 ++ add w9,w9,w5 ++ eor w13,w13,w7 ++ eor v17.16b,v17.16b,v7.16b ++ add w4,w4,w12 ++ add w5,w5,w15 ++ and w12,w10,w9 ++ add v3.4s,v3.4s,v17.4s ++ bic w15,w3,w9 ++ eor w11,w9,w9,ror#5 ++ add w5,w5,w13 ++ ushr v18.4s,v3.4s,#17 ++ orr w12,w12,w15 ++ ushr v19.4s,v3.4s,#10 ++ eor w11,w11,w9,ror#19 ++ eor w15,w5,w5,ror#11 ++ sli v18.4s,v3.4s,#15 ++ add w4,w4,w12 ++ ushr v17.4s,v3.4s,#19 ++ ror w11,w11,#6 ++ eor w13,w5,w6 ++ eor v19.16b,v19.16b,v18.16b ++ eor w15,w15,w5,ror#20 ++ add w4,w4,w11 ++ sli v17.4s,v3.4s,#13 ++ ldr w12,[sp,#60] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ ld1 {v4.4s},[x16], #16 ++ add w8,w8,w4 ++ eor v19.16b,v19.16b,v17.16b ++ eor w14,w14,w6 ++ eor v17.16b,v17.16b,v17.16b ++ add w3,w3,w12 ++ add w4,w4,w15 ++ and w12,w9,w8 ++ mov v17.d[1],v19.d[0] ++ bic w15,w10,w8 ++ eor w11,w8,w8,ror#5 ++ add w4,w4,w14 ++ add v3.4s,v3.4s,v17.4s ++ orr w12,w12,w15 ++ eor w11,w11,w8,ror#19 ++ eor w15,w4,w4,ror#11 ++ add v4.4s,v4.4s,v3.4s ++ add w3,w3,w12 ++ ror w11,w11,#6 ++ eor w14,w4,w5 ++ eor w15,w15,w4,ror#20 ++ add w3,w3,w11 ++ ldr w12,[x16] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w7,w7,w3 ++ eor w13,w13,w5 ++ st1 {v4.4s},[x17], #16 ++ cmp w12,#0 // check for K256 terminator ++ ldr w12,[sp,#0] ++ sub x17,x17,#64 ++ bne .L_00_48 ++ ++ sub x16,x16,#256 // rewind x16 ++ cmp x1,x2 ++ mov x17, #64 ++ csel x17, x17, xzr, eq ++ sub x1,x1,x17 // avoid SEGV ++ mov x17,sp ++ add w10,w10,w12 ++ add w3,w3,w15 ++ and w12,w8,w7 ++ ld1 {v0.16b},[x1],#16 ++ bic w15,w9,w7 ++ eor w11,w7,w7,ror#5 ++ ld1 {v4.4s},[x16],#16 ++ add w3,w3,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w7,ror#19 ++ eor w15,w3,w3,ror#11 ++ rev32 v0.16b,v0.16b ++ add w10,w10,w12 ++ ror w11,w11,#6 ++ eor w13,w3,w4 ++ eor w15,w15,w3,ror#20 ++ 
add v4.4s,v4.4s,v0.4s ++ add w10,w10,w11 ++ ldr w12,[sp,#4] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w6,w6,w10 ++ eor w14,w14,w4 ++ add w9,w9,w12 ++ add w10,w10,w15 ++ and w12,w7,w6 ++ bic w15,w8,w6 ++ eor w11,w6,w6,ror#5 ++ add w10,w10,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w6,ror#19 ++ eor w15,w10,w10,ror#11 ++ add w9,w9,w12 ++ ror w11,w11,#6 ++ eor w14,w10,w3 ++ eor w15,w15,w10,ror#20 ++ add w9,w9,w11 ++ ldr w12,[sp,#8] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w5,w5,w9 ++ eor w13,w13,w3 ++ add w8,w8,w12 ++ add w9,w9,w15 ++ and w12,w6,w5 ++ bic w15,w7,w5 ++ eor w11,w5,w5,ror#5 ++ add w9,w9,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w5,ror#19 ++ eor w15,w9,w9,ror#11 ++ add w8,w8,w12 ++ ror w11,w11,#6 ++ eor w13,w9,w10 ++ eor w15,w15,w9,ror#20 ++ add w8,w8,w11 ++ ldr w12,[sp,#12] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w4,w4,w8 ++ eor w14,w14,w10 ++ add w7,w7,w12 ++ add w8,w8,w15 ++ and w12,w5,w4 ++ bic w15,w6,w4 ++ eor w11,w4,w4,ror#5 ++ add w8,w8,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w4,ror#19 ++ eor w15,w8,w8,ror#11 ++ add w7,w7,w12 ++ ror w11,w11,#6 ++ eor w14,w8,w9 ++ eor w15,w15,w8,ror#20 ++ add w7,w7,w11 ++ ldr w12,[sp,#16] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w3,w3,w7 ++ eor w13,w13,w9 ++ st1 {v4.4s},[x17], #16 ++ add w6,w6,w12 ++ add w7,w7,w15 ++ and w12,w4,w3 ++ ld1 {v1.16b},[x1],#16 ++ bic w15,w5,w3 ++ eor w11,w3,w3,ror#5 ++ ld1 {v4.4s},[x16],#16 ++ add w7,w7,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w3,ror#19 ++ eor w15,w7,w7,ror#11 ++ rev32 v1.16b,v1.16b ++ add w6,w6,w12 ++ ror w11,w11,#6 ++ eor w13,w7,w8 ++ eor w15,w15,w7,ror#20 ++ add v4.4s,v4.4s,v1.4s ++ add w6,w6,w11 ++ ldr w12,[sp,#20] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w10,w10,w6 ++ eor w14,w14,w8 ++ add w5,w5,w12 ++ add w6,w6,w15 ++ and w12,w3,w10 ++ bic w15,w4,w10 ++ eor w11,w10,w10,ror#5 ++ add w6,w6,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w10,ror#19 ++ eor w15,w6,w6,ror#11 ++ add w5,w5,w12 ++ ror w11,w11,#6 ++ eor w14,w6,w7 ++ eor w15,w15,w6,ror#20 ++ add w5,w5,w11 ++ ldr w12,[sp,#24] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w9,w9,w5 ++ eor w13,w13,w7 ++ add w4,w4,w12 ++ add w5,w5,w15 ++ and w12,w10,w9 ++ bic w15,w3,w9 ++ eor w11,w9,w9,ror#5 ++ add w5,w5,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w9,ror#19 ++ eor w15,w5,w5,ror#11 ++ add w4,w4,w12 ++ ror w11,w11,#6 ++ eor w13,w5,w6 ++ eor w15,w15,w5,ror#20 ++ add w4,w4,w11 ++ ldr w12,[sp,#28] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w8,w8,w4 ++ eor w14,w14,w6 ++ add w3,w3,w12 ++ add w4,w4,w15 ++ and w12,w9,w8 ++ bic w15,w10,w8 ++ eor w11,w8,w8,ror#5 ++ add w4,w4,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w8,ror#19 ++ eor w15,w4,w4,ror#11 ++ add w3,w3,w12 ++ ror w11,w11,#6 ++ eor w14,w4,w5 ++ eor w15,w15,w4,ror#20 ++ add w3,w3,w11 ++ ldr w12,[sp,#32] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w7,w7,w3 ++ eor w13,w13,w5 ++ st1 {v4.4s},[x17], #16 ++ add w10,w10,w12 ++ add w3,w3,w15 ++ and w12,w8,w7 ++ ld1 {v2.16b},[x1],#16 ++ bic w15,w9,w7 ++ eor w11,w7,w7,ror#5 ++ ld1 {v4.4s},[x16],#16 ++ add w3,w3,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w7,ror#19 ++ eor w15,w3,w3,ror#11 ++ rev32 v2.16b,v2.16b ++ add w10,w10,w12 ++ ror w11,w11,#6 ++ eor w13,w3,w4 ++ eor w15,w15,w3,ror#20 ++ add v4.4s,v4.4s,v2.4s ++ add w10,w10,w11 ++ ldr w12,[sp,#36] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w6,w6,w10 ++ eor w14,w14,w4 ++ add w9,w9,w12 ++ add w10,w10,w15 ++ and w12,w7,w6 ++ bic w15,w8,w6 ++ eor w11,w6,w6,ror#5 ++ add w10,w10,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w6,ror#19 ++ eor w15,w10,w10,ror#11 ++ add w9,w9,w12 ++ ror w11,w11,#6 ++ eor w14,w10,w3 ++ eor w15,w15,w10,ror#20 
++ add w9,w9,w11 ++ ldr w12,[sp,#40] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w5,w5,w9 ++ eor w13,w13,w3 ++ add w8,w8,w12 ++ add w9,w9,w15 ++ and w12,w6,w5 ++ bic w15,w7,w5 ++ eor w11,w5,w5,ror#5 ++ add w9,w9,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w5,ror#19 ++ eor w15,w9,w9,ror#11 ++ add w8,w8,w12 ++ ror w11,w11,#6 ++ eor w13,w9,w10 ++ eor w15,w15,w9,ror#20 ++ add w8,w8,w11 ++ ldr w12,[sp,#44] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w4,w4,w8 ++ eor w14,w14,w10 ++ add w7,w7,w12 ++ add w8,w8,w15 ++ and w12,w5,w4 ++ bic w15,w6,w4 ++ eor w11,w4,w4,ror#5 ++ add w8,w8,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w4,ror#19 ++ eor w15,w8,w8,ror#11 ++ add w7,w7,w12 ++ ror w11,w11,#6 ++ eor w14,w8,w9 ++ eor w15,w15,w8,ror#20 ++ add w7,w7,w11 ++ ldr w12,[sp,#48] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w3,w3,w7 ++ eor w13,w13,w9 ++ st1 {v4.4s},[x17], #16 ++ add w6,w6,w12 ++ add w7,w7,w15 ++ and w12,w4,w3 ++ ld1 {v3.16b},[x1],#16 ++ bic w15,w5,w3 ++ eor w11,w3,w3,ror#5 ++ ld1 {v4.4s},[x16],#16 ++ add w7,w7,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w3,ror#19 ++ eor w15,w7,w7,ror#11 ++ rev32 v3.16b,v3.16b ++ add w6,w6,w12 ++ ror w11,w11,#6 ++ eor w13,w7,w8 ++ eor w15,w15,w7,ror#20 ++ add v4.4s,v4.4s,v3.4s ++ add w6,w6,w11 ++ ldr w12,[sp,#52] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w10,w10,w6 ++ eor w14,w14,w8 ++ add w5,w5,w12 ++ add w6,w6,w15 ++ and w12,w3,w10 ++ bic w15,w4,w10 ++ eor w11,w10,w10,ror#5 ++ add w6,w6,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w10,ror#19 ++ eor w15,w6,w6,ror#11 ++ add w5,w5,w12 ++ ror w11,w11,#6 ++ eor w14,w6,w7 ++ eor w15,w15,w6,ror#20 ++ add w5,w5,w11 ++ ldr w12,[sp,#56] ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w9,w9,w5 ++ eor w13,w13,w7 ++ add w4,w4,w12 ++ add w5,w5,w15 ++ and w12,w10,w9 ++ bic w15,w3,w9 ++ eor w11,w9,w9,ror#5 ++ add w5,w5,w13 ++ orr w12,w12,w15 ++ eor w11,w11,w9,ror#19 ++ eor w15,w5,w5,ror#11 ++ add w4,w4,w12 ++ ror w11,w11,#6 ++ eor w13,w5,w6 ++ eor w15,w15,w5,ror#20 ++ add w4,w4,w11 ++ ldr w12,[sp,#60] ++ and w14,w14,w13 ++ ror w15,w15,#2 ++ add w8,w8,w4 ++ eor w14,w14,w6 ++ add w3,w3,w12 ++ add w4,w4,w15 ++ and w12,w9,w8 ++ bic w15,w10,w8 ++ eor w11,w8,w8,ror#5 ++ add w4,w4,w14 ++ orr w12,w12,w15 ++ eor w11,w11,w8,ror#19 ++ eor w15,w4,w4,ror#11 ++ add w3,w3,w12 ++ ror w11,w11,#6 ++ eor w14,w4,w5 ++ eor w15,w15,w4,ror#20 ++ add w3,w3,w11 ++ and w13,w13,w14 ++ ror w15,w15,#2 ++ add w7,w7,w3 ++ eor w13,w13,w5 ++ st1 {v4.4s},[x17], #16 ++ add w3,w3,w15 // h+=Sigma0(a) from the past ++ ldp w11,w12,[x0,#0] ++ add w3,w3,w13 // h+=Maj(a,b,c) from the past ++ ldp w13,w14,[x0,#8] ++ add w3,w3,w11 // accumulate ++ add w4,w4,w12 ++ ldp w11,w12,[x0,#16] ++ add w5,w5,w13 ++ add w6,w6,w14 ++ ldp w13,w14,[x0,#24] ++ add w7,w7,w11 ++ add w8,w8,w12 ++ ldr w12,[sp,#0] ++ stp w3,w4,[x0,#0] ++ add w9,w9,w13 ++ mov w13,wzr ++ stp w5,w6,[x0,#8] ++ add w10,w10,w14 ++ stp w7,w8,[x0,#16] ++ eor w14,w4,w5 ++ stp w9,w10,[x0,#24] ++ mov w15,wzr ++ mov x17,sp ++ b.ne .L_00_48 ++ ++ ldr x29,[x29] ++ add sp,sp,#16*4+16 ++ ret ++.size sha256_block_neon,.-sha256_block_neon ++#ifndef __KERNEL__ ++.comm OPENSSL_armcap_P,4,4 ++#endif +diff --git a/arch/arm64/crypto/sha512-core.S b/arch/arm64/crypto/sha512-core.S +new file mode 100644 +index 000000000000..bd0f59f06c9d +--- /dev/null ++++ b/arch/arm64/crypto/sha512-core.S +@@ -0,0 +1,1085 @@ ++// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. ++// ++// Licensed under the OpenSSL license (the "License"). You may not use ++// this file except in compliance with the License. 
You can obtain a copy ++// in the file LICENSE in the source distribution or at ++// https://www.openssl.org/source/license.html ++ ++// ==================================================================== ++// Written by Andy Polyakov for the OpenSSL ++// project. The module is, however, dual licensed under OpenSSL and ++// CRYPTOGAMS licenses depending on where you obtain it. For further ++// details see http://www.openssl.org/~appro/cryptogams/. ++// ++// Permission to use under GPLv2 terms is granted. ++// ==================================================================== ++// ++// SHA256/512 for ARMv8. ++// ++// Performance in cycles per processed byte and improvement coefficient ++// over code generated with "default" compiler: ++// ++// SHA256-hw SHA256(*) SHA512 ++// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) ++// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) ++// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) ++// Denver 2.01 10.5 (+26%) 6.70 (+8%) ++// X-Gene 20.0 (+100%) 12.8 (+300%(***)) ++// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) ++// ++// (*) Software SHA256 results are of lesser relevance, presented ++// mostly for informational purposes. ++// (**) The result is a trade-off: it's possible to improve it by ++// 10% (or by 1 cycle per round), but at the cost of 20% loss ++// on Cortex-A53 (or by 4 cycles per round). ++// (***) Super-impressive coefficients over gcc-generated code are ++// an indication of some compiler "pathology", most notably code ++// generated with -mgeneral-regs-only is significantly faster ++// and the gap is only 40-90%. ++// ++// October 2016. ++// ++// Originally it was reckoned that it makes no sense to implement NEON ++// version of SHA256 for 64-bit processors. This is because performance ++// improvement on most widespread Cortex-A5x processors was observed ++// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was ++// observed that 32-bit NEON SHA256 performs significantly better than ++// 64-bit scalar version on *some* of the more recent processors. As a ++// result 64-bit NEON version of SHA256 was added to provide best ++// all-round performance. For example it executes ~30% faster on X-Gene ++// and Mongoose. [For reference, NEON version of SHA512 is bound to ++// deliver much less improvement, likely *negative* on Cortex-A5x. ++// Which is why NEON support is limited to SHA256.] ++ ++#ifndef __KERNEL__ ++# include "arm_arch.h" ++#endif ++ ++.text ++ ++.extern OPENSSL_armcap_P ++.globl sha512_block_data_order ++.type sha512_block_data_order,%function ++.align 6 ++sha512_block_data_order: ++ stp x29,x30,[sp,#-128]!
++ add x29,sp,#0 ++ ++ stp x19,x20,[sp,#16] ++ stp x21,x22,[sp,#32] ++ stp x23,x24,[sp,#48] ++ stp x25,x26,[sp,#64] ++ stp x27,x28,[sp,#80] ++ sub sp,sp,#4*8 ++ ++ ldp x20,x21,[x0] // load context ++ ldp x22,x23,[x0,#2*8] ++ ldp x24,x25,[x0,#4*8] ++ add x2,x1,x2,lsl#7 // end of input ++ ldp x26,x27,[x0,#6*8] ++ adr x30,.LK512 ++ stp x0,x2,[x29,#96] ++ ++.Loop: ++ ldp x3,x4,[x1],#2*8 ++ ldr x19,[x30],#8 // *K++ ++ eor x28,x21,x22 // magic seed ++ str x1,[x29,#112] ++#ifndef __AARCH64EB__ ++ rev x3,x3 // 0 ++#endif ++ ror x16,x24,#14 ++ add x27,x27,x19 // h+=K[i] ++ eor x6,x24,x24,ror#23 ++ and x17,x25,x24 ++ bic x19,x26,x24 ++ add x27,x27,x3 // h+=X[i] ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x20,x21 // a^b, b^c in next round ++ eor x16,x16,x6,ror#18 // Sigma1(e) ++ ror x6,x20,#28 ++ add x27,x27,x17 // h+=Ch(e,f,g) ++ eor x17,x20,x20,ror#5 ++ add x27,x27,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x23,x23,x27 // d+=h ++ eor x28,x28,x21 // Maj(a,b,c) ++ eor x17,x6,x17,ror#34 // Sigma0(a) ++ add x27,x27,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x27,x27,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x4,x4 // 1 ++#endif ++ ldp x5,x6,[x1],#2*8 ++ add x27,x27,x17 // h+=Sigma0(a) ++ ror x16,x23,#14 ++ add x26,x26,x28 // h+=K[i] ++ eor x7,x23,x23,ror#23 ++ and x17,x24,x23 ++ bic x28,x25,x23 ++ add x26,x26,x4 // h+=X[i] ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x27,x20 // a^b, b^c in next round ++ eor x16,x16,x7,ror#18 // Sigma1(e) ++ ror x7,x27,#28 ++ add x26,x26,x17 // h+=Ch(e,f,g) ++ eor x17,x27,x27,ror#5 ++ add x26,x26,x16 // h+=Sigma1(e) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ add x22,x22,x26 // d+=h ++ eor x19,x19,x20 // Maj(a,b,c) ++ eor x17,x7,x17,ror#34 // Sigma0(a) ++ add x26,x26,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ //add x26,x26,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x5,x5 // 2 ++#endif ++ add x26,x26,x17 // h+=Sigma0(a) ++ ror x16,x22,#14 ++ add x25,x25,x19 // h+=K[i] ++ eor x8,x22,x22,ror#23 ++ and x17,x23,x22 ++ bic x19,x24,x22 ++ add x25,x25,x5 // h+=X[i] ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x26,x27 // a^b, b^c in next round ++ eor x16,x16,x8,ror#18 // Sigma1(e) ++ ror x8,x26,#28 ++ add x25,x25,x17 // h+=Ch(e,f,g) ++ eor x17,x26,x26,ror#5 ++ add x25,x25,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x21,x21,x25 // d+=h ++ eor x28,x28,x27 // Maj(a,b,c) ++ eor x17,x8,x17,ror#34 // Sigma0(a) ++ add x25,x25,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x25,x25,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x6,x6 // 3 ++#endif ++ ldp x7,x8,[x1],#2*8 ++ add x25,x25,x17 // h+=Sigma0(a) ++ ror x16,x21,#14 ++ add x24,x24,x28 // h+=K[i] ++ eor x9,x21,x21,ror#23 ++ and x17,x22,x21 ++ bic x28,x23,x21 ++ add x24,x24,x6 // h+=X[i] ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x25,x26 // a^b, b^c in next round ++ eor x16,x16,x9,ror#18 // Sigma1(e) ++ ror x9,x25,#28 ++ add x24,x24,x17 // h+=Ch(e,f,g) ++ eor x17,x25,x25,ror#5 ++ add x24,x24,x16 // h+=Sigma1(e) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ add x20,x20,x24 // d+=h ++ eor x19,x19,x26 // Maj(a,b,c) ++ eor x17,x9,x17,ror#34 // Sigma0(a) ++ add x24,x24,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ //add x24,x24,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x7,x7 // 4 ++#endif ++ add x24,x24,x17 // h+=Sigma0(a) ++ ror x16,x20,#14 ++ add x23,x23,x19 // h+=K[i] ++ eor x10,x20,x20,ror#23 ++ and x17,x21,x20 ++ bic x19,x22,x20 ++ add x23,x23,x7 // h+=X[i] ++ orr 
x17,x17,x19 // Ch(e,f,g) ++ eor x19,x24,x25 // a^b, b^c in next round ++ eor x16,x16,x10,ror#18 // Sigma1(e) ++ ror x10,x24,#28 ++ add x23,x23,x17 // h+=Ch(e,f,g) ++ eor x17,x24,x24,ror#5 ++ add x23,x23,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x27,x27,x23 // d+=h ++ eor x28,x28,x25 // Maj(a,b,c) ++ eor x17,x10,x17,ror#34 // Sigma0(a) ++ add x23,x23,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x23,x23,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x8,x8 // 5 ++#endif ++ ldp x9,x10,[x1],#2*8 ++ add x23,x23,x17 // h+=Sigma0(a) ++ ror x16,x27,#14 ++ add x22,x22,x28 // h+=K[i] ++ eor x11,x27,x27,ror#23 ++ and x17,x20,x27 ++ bic x28,x21,x27 ++ add x22,x22,x8 // h+=X[i] ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x23,x24 // a^b, b^c in next round ++ eor x16,x16,x11,ror#18 // Sigma1(e) ++ ror x11,x23,#28 ++ add x22,x22,x17 // h+=Ch(e,f,g) ++ eor x17,x23,x23,ror#5 ++ add x22,x22,x16 // h+=Sigma1(e) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ add x26,x26,x22 // d+=h ++ eor x19,x19,x24 // Maj(a,b,c) ++ eor x17,x11,x17,ror#34 // Sigma0(a) ++ add x22,x22,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ //add x22,x22,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x9,x9 // 6 ++#endif ++ add x22,x22,x17 // h+=Sigma0(a) ++ ror x16,x26,#14 ++ add x21,x21,x19 // h+=K[i] ++ eor x12,x26,x26,ror#23 ++ and x17,x27,x26 ++ bic x19,x20,x26 ++ add x21,x21,x9 // h+=X[i] ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x22,x23 // a^b, b^c in next round ++ eor x16,x16,x12,ror#18 // Sigma1(e) ++ ror x12,x22,#28 ++ add x21,x21,x17 // h+=Ch(e,f,g) ++ eor x17,x22,x22,ror#5 ++ add x21,x21,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x25,x25,x21 // d+=h ++ eor x28,x28,x23 // Maj(a,b,c) ++ eor x17,x12,x17,ror#34 // Sigma0(a) ++ add x21,x21,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x21,x21,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x10,x10 // 7 ++#endif ++ ldp x11,x12,[x1],#2*8 ++ add x21,x21,x17 // h+=Sigma0(a) ++ ror x16,x25,#14 ++ add x20,x20,x28 // h+=K[i] ++ eor x13,x25,x25,ror#23 ++ and x17,x26,x25 ++ bic x28,x27,x25 ++ add x20,x20,x10 // h+=X[i] ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x21,x22 // a^b, b^c in next round ++ eor x16,x16,x13,ror#18 // Sigma1(e) ++ ror x13,x21,#28 ++ add x20,x20,x17 // h+=Ch(e,f,g) ++ eor x17,x21,x21,ror#5 ++ add x20,x20,x16 // h+=Sigma1(e) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ add x24,x24,x20 // d+=h ++ eor x19,x19,x22 // Maj(a,b,c) ++ eor x17,x13,x17,ror#34 // Sigma0(a) ++ add x20,x20,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ //add x20,x20,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x11,x11 // 8 ++#endif ++ add x20,x20,x17 // h+=Sigma0(a) ++ ror x16,x24,#14 ++ add x27,x27,x19 // h+=K[i] ++ eor x14,x24,x24,ror#23 ++ and x17,x25,x24 ++ bic x19,x26,x24 ++ add x27,x27,x11 // h+=X[i] ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x20,x21 // a^b, b^c in next round ++ eor x16,x16,x14,ror#18 // Sigma1(e) ++ ror x14,x20,#28 ++ add x27,x27,x17 // h+=Ch(e,f,g) ++ eor x17,x20,x20,ror#5 ++ add x27,x27,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x23,x23,x27 // d+=h ++ eor x28,x28,x21 // Maj(a,b,c) ++ eor x17,x14,x17,ror#34 // Sigma0(a) ++ add x27,x27,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x27,x27,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x12,x12 // 9 ++#endif ++ ldp x13,x14,[x1],#2*8 ++ add x27,x27,x17 // h+=Sigma0(a) ++ ror x16,x23,#14 ++ add x26,x26,x28 // h+=K[i] ++ eor 
x15,x23,x23,ror#23 ++ and x17,x24,x23 ++ bic x28,x25,x23 ++ add x26,x26,x12 // h+=X[i] ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x27,x20 // a^b, b^c in next round ++ eor x16,x16,x15,ror#18 // Sigma1(e) ++ ror x15,x27,#28 ++ add x26,x26,x17 // h+=Ch(e,f,g) ++ eor x17,x27,x27,ror#5 ++ add x26,x26,x16 // h+=Sigma1(e) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ add x22,x22,x26 // d+=h ++ eor x19,x19,x20 // Maj(a,b,c) ++ eor x17,x15,x17,ror#34 // Sigma0(a) ++ add x26,x26,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ //add x26,x26,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x13,x13 // 10 ++#endif ++ add x26,x26,x17 // h+=Sigma0(a) ++ ror x16,x22,#14 ++ add x25,x25,x19 // h+=K[i] ++ eor x0,x22,x22,ror#23 ++ and x17,x23,x22 ++ bic x19,x24,x22 ++ add x25,x25,x13 // h+=X[i] ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x26,x27 // a^b, b^c in next round ++ eor x16,x16,x0,ror#18 // Sigma1(e) ++ ror x0,x26,#28 ++ add x25,x25,x17 // h+=Ch(e,f,g) ++ eor x17,x26,x26,ror#5 ++ add x25,x25,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x21,x21,x25 // d+=h ++ eor x28,x28,x27 // Maj(a,b,c) ++ eor x17,x0,x17,ror#34 // Sigma0(a) ++ add x25,x25,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x25,x25,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x14,x14 // 11 ++#endif ++ ldp x15,x0,[x1],#2*8 ++ add x25,x25,x17 // h+=Sigma0(a) ++ str x6,[sp,#24] ++ ror x16,x21,#14 ++ add x24,x24,x28 // h+=K[i] ++ eor x6,x21,x21,ror#23 ++ and x17,x22,x21 ++ bic x28,x23,x21 ++ add x24,x24,x14 // h+=X[i] ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x25,x26 // a^b, b^c in next round ++ eor x16,x16,x6,ror#18 // Sigma1(e) ++ ror x6,x25,#28 ++ add x24,x24,x17 // h+=Ch(e,f,g) ++ eor x17,x25,x25,ror#5 ++ add x24,x24,x16 // h+=Sigma1(e) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ add x20,x20,x24 // d+=h ++ eor x19,x19,x26 // Maj(a,b,c) ++ eor x17,x6,x17,ror#34 // Sigma0(a) ++ add x24,x24,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ //add x24,x24,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x15,x15 // 12 ++#endif ++ add x24,x24,x17 // h+=Sigma0(a) ++ str x7,[sp,#0] ++ ror x16,x20,#14 ++ add x23,x23,x19 // h+=K[i] ++ eor x7,x20,x20,ror#23 ++ and x17,x21,x20 ++ bic x19,x22,x20 ++ add x23,x23,x15 // h+=X[i] ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x24,x25 // a^b, b^c in next round ++ eor x16,x16,x7,ror#18 // Sigma1(e) ++ ror x7,x24,#28 ++ add x23,x23,x17 // h+=Ch(e,f,g) ++ eor x17,x24,x24,ror#5 ++ add x23,x23,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x27,x27,x23 // d+=h ++ eor x28,x28,x25 // Maj(a,b,c) ++ eor x17,x7,x17,ror#34 // Sigma0(a) ++ add x23,x23,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x23,x23,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x0,x0 // 13 ++#endif ++ ldp x1,x2,[x1] ++ add x23,x23,x17 // h+=Sigma0(a) ++ str x8,[sp,#8] ++ ror x16,x27,#14 ++ add x22,x22,x28 // h+=K[i] ++ eor x8,x27,x27,ror#23 ++ and x17,x20,x27 ++ bic x28,x21,x27 ++ add x22,x22,x0 // h+=X[i] ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x23,x24 // a^b, b^c in next round ++ eor x16,x16,x8,ror#18 // Sigma1(e) ++ ror x8,x23,#28 ++ add x22,x22,x17 // h+=Ch(e,f,g) ++ eor x17,x23,x23,ror#5 ++ add x22,x22,x16 // h+=Sigma1(e) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ add x26,x26,x22 // d+=h ++ eor x19,x19,x24 // Maj(a,b,c) ++ eor x17,x8,x17,ror#34 // Sigma0(a) ++ add x22,x22,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ //add x22,x22,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x1,x1 // 
14 ++#endif ++ ldr x6,[sp,#24] ++ add x22,x22,x17 // h+=Sigma0(a) ++ str x9,[sp,#16] ++ ror x16,x26,#14 ++ add x21,x21,x19 // h+=K[i] ++ eor x9,x26,x26,ror#23 ++ and x17,x27,x26 ++ bic x19,x20,x26 ++ add x21,x21,x1 // h+=X[i] ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x22,x23 // a^b, b^c in next round ++ eor x16,x16,x9,ror#18 // Sigma1(e) ++ ror x9,x22,#28 ++ add x21,x21,x17 // h+=Ch(e,f,g) ++ eor x17,x22,x22,ror#5 ++ add x21,x21,x16 // h+=Sigma1(e) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ add x25,x25,x21 // d+=h ++ eor x28,x28,x23 // Maj(a,b,c) ++ eor x17,x9,x17,ror#34 // Sigma0(a) ++ add x21,x21,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ //add x21,x21,x17 // h+=Sigma0(a) ++#ifndef __AARCH64EB__ ++ rev x2,x2 // 15 ++#endif ++ ldr x7,[sp,#0] ++ add x21,x21,x17 // h+=Sigma0(a) ++ str x10,[sp,#24] ++ ror x16,x25,#14 ++ add x20,x20,x28 // h+=K[i] ++ ror x9,x4,#1 ++ and x17,x26,x25 ++ ror x8,x1,#19 ++ bic x28,x27,x25 ++ ror x10,x21,#28 ++ add x20,x20,x2 // h+=X[i] ++ eor x16,x16,x25,ror#18 ++ eor x9,x9,x4,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x21,x22 // a^b, b^c in next round ++ eor x16,x16,x25,ror#41 // Sigma1(e) ++ eor x10,x10,x21,ror#34 ++ add x20,x20,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x8,x8,x1,ror#61 ++ eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) ++ add x20,x20,x16 // h+=Sigma1(e) ++ eor x19,x19,x22 // Maj(a,b,c) ++ eor x17,x10,x21,ror#39 // Sigma0(a) ++ eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) ++ add x3,x3,x12 ++ add x24,x24,x20 // d+=h ++ add x20,x20,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x3,x3,x9 ++ add x20,x20,x17 // h+=Sigma0(a) ++ add x3,x3,x8 ++.Loop_16_xx: ++ ldr x8,[sp,#8] ++ str x11,[sp,#0] ++ ror x16,x24,#14 ++ add x27,x27,x19 // h+=K[i] ++ ror x10,x5,#1 ++ and x17,x25,x24 ++ ror x9,x2,#19 ++ bic x19,x26,x24 ++ ror x11,x20,#28 ++ add x27,x27,x3 // h+=X[i] ++ eor x16,x16,x24,ror#18 ++ eor x10,x10,x5,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x20,x21 // a^b, b^c in next round ++ eor x16,x16,x24,ror#41 // Sigma1(e) ++ eor x11,x11,x20,ror#34 ++ add x27,x27,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x9,x9,x2,ror#61 ++ eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) ++ add x27,x27,x16 // h+=Sigma1(e) ++ eor x28,x28,x21 // Maj(a,b,c) ++ eor x17,x11,x20,ror#39 // Sigma0(a) ++ eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) ++ add x4,x4,x13 ++ add x23,x23,x27 // d+=h ++ add x27,x27,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x4,x4,x10 ++ add x27,x27,x17 // h+=Sigma0(a) ++ add x4,x4,x9 ++ ldr x9,[sp,#16] ++ str x12,[sp,#8] ++ ror x16,x23,#14 ++ add x26,x26,x28 // h+=K[i] ++ ror x11,x6,#1 ++ and x17,x24,x23 ++ ror x10,x3,#19 ++ bic x28,x25,x23 ++ ror x12,x27,#28 ++ add x26,x26,x4 // h+=X[i] ++ eor x16,x16,x23,ror#18 ++ eor x11,x11,x6,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x27,x20 // a^b, b^c in next round ++ eor x16,x16,x23,ror#41 // Sigma1(e) ++ eor x12,x12,x27,ror#34 ++ add x26,x26,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x10,x10,x3,ror#61 ++ eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) ++ add x26,x26,x16 // h+=Sigma1(e) ++ eor x19,x19,x20 // Maj(a,b,c) ++ eor x17,x12,x27,ror#39 // Sigma0(a) ++ eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) ++ add x5,x5,x14 ++ add x22,x22,x26 // d+=h ++ add x26,x26,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x5,x5,x11 ++ add x26,x26,x17 // h+=Sigma0(a) ++ add x5,x5,x10 ++ ldr x10,[sp,#24] ++ str x13,[sp,#16] ++ ror x16,x22,#14 ++ add x25,x25,x19 // h+=K[i] ++ ror x12,x7,#1 ++ and 
x17,x23,x22 ++ ror x11,x4,#19 ++ bic x19,x24,x22 ++ ror x13,x26,#28 ++ add x25,x25,x5 // h+=X[i] ++ eor x16,x16,x22,ror#18 ++ eor x12,x12,x7,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x26,x27 // a^b, b^c in next round ++ eor x16,x16,x22,ror#41 // Sigma1(e) ++ eor x13,x13,x26,ror#34 ++ add x25,x25,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x11,x11,x4,ror#61 ++ eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) ++ add x25,x25,x16 // h+=Sigma1(e) ++ eor x28,x28,x27 // Maj(a,b,c) ++ eor x17,x13,x26,ror#39 // Sigma0(a) ++ eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) ++ add x6,x6,x15 ++ add x21,x21,x25 // d+=h ++ add x25,x25,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x6,x6,x12 ++ add x25,x25,x17 // h+=Sigma0(a) ++ add x6,x6,x11 ++ ldr x11,[sp,#0] ++ str x14,[sp,#24] ++ ror x16,x21,#14 ++ add x24,x24,x28 // h+=K[i] ++ ror x13,x8,#1 ++ and x17,x22,x21 ++ ror x12,x5,#19 ++ bic x28,x23,x21 ++ ror x14,x25,#28 ++ add x24,x24,x6 // h+=X[i] ++ eor x16,x16,x21,ror#18 ++ eor x13,x13,x8,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x25,x26 // a^b, b^c in next round ++ eor x16,x16,x21,ror#41 // Sigma1(e) ++ eor x14,x14,x25,ror#34 ++ add x24,x24,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x12,x12,x5,ror#61 ++ eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) ++ add x24,x24,x16 // h+=Sigma1(e) ++ eor x19,x19,x26 // Maj(a,b,c) ++ eor x17,x14,x25,ror#39 // Sigma0(a) ++ eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) ++ add x7,x7,x0 ++ add x20,x20,x24 // d+=h ++ add x24,x24,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x7,x7,x13 ++ add x24,x24,x17 // h+=Sigma0(a) ++ add x7,x7,x12 ++ ldr x12,[sp,#8] ++ str x15,[sp,#0] ++ ror x16,x20,#14 ++ add x23,x23,x19 // h+=K[i] ++ ror x14,x9,#1 ++ and x17,x21,x20 ++ ror x13,x6,#19 ++ bic x19,x22,x20 ++ ror x15,x24,#28 ++ add x23,x23,x7 // h+=X[i] ++ eor x16,x16,x20,ror#18 ++ eor x14,x14,x9,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x24,x25 // a^b, b^c in next round ++ eor x16,x16,x20,ror#41 // Sigma1(e) ++ eor x15,x15,x24,ror#34 ++ add x23,x23,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x13,x13,x6,ror#61 ++ eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) ++ add x23,x23,x16 // h+=Sigma1(e) ++ eor x28,x28,x25 // Maj(a,b,c) ++ eor x17,x15,x24,ror#39 // Sigma0(a) ++ eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) ++ add x8,x8,x1 ++ add x27,x27,x23 // d+=h ++ add x23,x23,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x8,x8,x14 ++ add x23,x23,x17 // h+=Sigma0(a) ++ add x8,x8,x13 ++ ldr x13,[sp,#16] ++ str x0,[sp,#8] ++ ror x16,x27,#14 ++ add x22,x22,x28 // h+=K[i] ++ ror x15,x10,#1 ++ and x17,x20,x27 ++ ror x14,x7,#19 ++ bic x28,x21,x27 ++ ror x0,x23,#28 ++ add x22,x22,x8 // h+=X[i] ++ eor x16,x16,x27,ror#18 ++ eor x15,x15,x10,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x23,x24 // a^b, b^c in next round ++ eor x16,x16,x27,ror#41 // Sigma1(e) ++ eor x0,x0,x23,ror#34 ++ add x22,x22,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x14,x14,x7,ror#61 ++ eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) ++ add x22,x22,x16 // h+=Sigma1(e) ++ eor x19,x19,x24 // Maj(a,b,c) ++ eor x17,x0,x23,ror#39 // Sigma0(a) ++ eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) ++ add x9,x9,x2 ++ add x26,x26,x22 // d+=h ++ add x22,x22,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x9,x9,x15 ++ add x22,x22,x17 // h+=Sigma0(a) ++ add x9,x9,x14 ++ ldr x14,[sp,#24] ++ str x1,[sp,#16] ++ ror x16,x26,#14 ++ add x21,x21,x19 // h+=K[i] ++ ror x0,x11,#1 ++ and x17,x27,x26 ++ ror 
x15,x8,#19 ++ bic x19,x20,x26 ++ ror x1,x22,#28 ++ add x21,x21,x9 // h+=X[i] ++ eor x16,x16,x26,ror#18 ++ eor x0,x0,x11,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x22,x23 // a^b, b^c in next round ++ eor x16,x16,x26,ror#41 // Sigma1(e) ++ eor x1,x1,x22,ror#34 ++ add x21,x21,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x15,x15,x8,ror#61 ++ eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) ++ add x21,x21,x16 // h+=Sigma1(e) ++ eor x28,x28,x23 // Maj(a,b,c) ++ eor x17,x1,x22,ror#39 // Sigma0(a) ++ eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) ++ add x10,x10,x3 ++ add x25,x25,x21 // d+=h ++ add x21,x21,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x10,x10,x0 ++ add x21,x21,x17 // h+=Sigma0(a) ++ add x10,x10,x15 ++ ldr x15,[sp,#0] ++ str x2,[sp,#24] ++ ror x16,x25,#14 ++ add x20,x20,x28 // h+=K[i] ++ ror x1,x12,#1 ++ and x17,x26,x25 ++ ror x0,x9,#19 ++ bic x28,x27,x25 ++ ror x2,x21,#28 ++ add x20,x20,x10 // h+=X[i] ++ eor x16,x16,x25,ror#18 ++ eor x1,x1,x12,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x21,x22 // a^b, b^c in next round ++ eor x16,x16,x25,ror#41 // Sigma1(e) ++ eor x2,x2,x21,ror#34 ++ add x20,x20,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x0,x0,x9,ror#61 ++ eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) ++ add x20,x20,x16 // h+=Sigma1(e) ++ eor x19,x19,x22 // Maj(a,b,c) ++ eor x17,x2,x21,ror#39 // Sigma0(a) ++ eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) ++ add x11,x11,x4 ++ add x24,x24,x20 // d+=h ++ add x20,x20,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x11,x11,x1 ++ add x20,x20,x17 // h+=Sigma0(a) ++ add x11,x11,x0 ++ ldr x0,[sp,#8] ++ str x3,[sp,#0] ++ ror x16,x24,#14 ++ add x27,x27,x19 // h+=K[i] ++ ror x2,x13,#1 ++ and x17,x25,x24 ++ ror x1,x10,#19 ++ bic x19,x26,x24 ++ ror x3,x20,#28 ++ add x27,x27,x11 // h+=X[i] ++ eor x16,x16,x24,ror#18 ++ eor x2,x2,x13,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x20,x21 // a^b, b^c in next round ++ eor x16,x16,x24,ror#41 // Sigma1(e) ++ eor x3,x3,x20,ror#34 ++ add x27,x27,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x1,x1,x10,ror#61 ++ eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) ++ add x27,x27,x16 // h+=Sigma1(e) ++ eor x28,x28,x21 // Maj(a,b,c) ++ eor x17,x3,x20,ror#39 // Sigma0(a) ++ eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) ++ add x12,x12,x5 ++ add x23,x23,x27 // d+=h ++ add x27,x27,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x12,x12,x2 ++ add x27,x27,x17 // h+=Sigma0(a) ++ add x12,x12,x1 ++ ldr x1,[sp,#16] ++ str x4,[sp,#8] ++ ror x16,x23,#14 ++ add x26,x26,x28 // h+=K[i] ++ ror x3,x14,#1 ++ and x17,x24,x23 ++ ror x2,x11,#19 ++ bic x28,x25,x23 ++ ror x4,x27,#28 ++ add x26,x26,x12 // h+=X[i] ++ eor x16,x16,x23,ror#18 ++ eor x3,x3,x14,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x27,x20 // a^b, b^c in next round ++ eor x16,x16,x23,ror#41 // Sigma1(e) ++ eor x4,x4,x27,ror#34 ++ add x26,x26,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x2,x2,x11,ror#61 ++ eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) ++ add x26,x26,x16 // h+=Sigma1(e) ++ eor x19,x19,x20 // Maj(a,b,c) ++ eor x17,x4,x27,ror#39 // Sigma0(a) ++ eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) ++ add x13,x13,x6 ++ add x22,x22,x26 // d+=h ++ add x26,x26,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x13,x13,x3 ++ add x26,x26,x17 // h+=Sigma0(a) ++ add x13,x13,x2 ++ ldr x2,[sp,#24] ++ str x5,[sp,#16] ++ ror x16,x22,#14 ++ add x25,x25,x19 // h+=K[i] ++ ror x4,x15,#1 ++ and x17,x23,x22 ++ ror x3,x12,#19 ++ bic x19,x24,x22 ++ ror 
x5,x26,#28 ++ add x25,x25,x13 // h+=X[i] ++ eor x16,x16,x22,ror#18 ++ eor x4,x4,x15,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x26,x27 // a^b, b^c in next round ++ eor x16,x16,x22,ror#41 // Sigma1(e) ++ eor x5,x5,x26,ror#34 ++ add x25,x25,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x3,x3,x12,ror#61 ++ eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) ++ add x25,x25,x16 // h+=Sigma1(e) ++ eor x28,x28,x27 // Maj(a,b,c) ++ eor x17,x5,x26,ror#39 // Sigma0(a) ++ eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) ++ add x14,x14,x7 ++ add x21,x21,x25 // d+=h ++ add x25,x25,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x14,x14,x4 ++ add x25,x25,x17 // h+=Sigma0(a) ++ add x14,x14,x3 ++ ldr x3,[sp,#0] ++ str x6,[sp,#24] ++ ror x16,x21,#14 ++ add x24,x24,x28 // h+=K[i] ++ ror x5,x0,#1 ++ and x17,x22,x21 ++ ror x4,x13,#19 ++ bic x28,x23,x21 ++ ror x6,x25,#28 ++ add x24,x24,x14 // h+=X[i] ++ eor x16,x16,x21,ror#18 ++ eor x5,x5,x0,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x25,x26 // a^b, b^c in next round ++ eor x16,x16,x21,ror#41 // Sigma1(e) ++ eor x6,x6,x25,ror#34 ++ add x24,x24,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x4,x4,x13,ror#61 ++ eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) ++ add x24,x24,x16 // h+=Sigma1(e) ++ eor x19,x19,x26 // Maj(a,b,c) ++ eor x17,x6,x25,ror#39 // Sigma0(a) ++ eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) ++ add x15,x15,x8 ++ add x20,x20,x24 // d+=h ++ add x24,x24,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x15,x15,x5 ++ add x24,x24,x17 // h+=Sigma0(a) ++ add x15,x15,x4 ++ ldr x4,[sp,#8] ++ str x7,[sp,#0] ++ ror x16,x20,#14 ++ add x23,x23,x19 // h+=K[i] ++ ror x6,x1,#1 ++ and x17,x21,x20 ++ ror x5,x14,#19 ++ bic x19,x22,x20 ++ ror x7,x24,#28 ++ add x23,x23,x15 // h+=X[i] ++ eor x16,x16,x20,ror#18 ++ eor x6,x6,x1,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x24,x25 // a^b, b^c in next round ++ eor x16,x16,x20,ror#41 // Sigma1(e) ++ eor x7,x7,x24,ror#34 ++ add x23,x23,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x5,x5,x14,ror#61 ++ eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) ++ add x23,x23,x16 // h+=Sigma1(e) ++ eor x28,x28,x25 // Maj(a,b,c) ++ eor x17,x7,x24,ror#39 // Sigma0(a) ++ eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) ++ add x0,x0,x9 ++ add x27,x27,x23 // d+=h ++ add x23,x23,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x0,x0,x6 ++ add x23,x23,x17 // h+=Sigma0(a) ++ add x0,x0,x5 ++ ldr x5,[sp,#16] ++ str x8,[sp,#8] ++ ror x16,x27,#14 ++ add x22,x22,x28 // h+=K[i] ++ ror x7,x2,#1 ++ and x17,x20,x27 ++ ror x6,x15,#19 ++ bic x28,x21,x27 ++ ror x8,x23,#28 ++ add x22,x22,x0 // h+=X[i] ++ eor x16,x16,x27,ror#18 ++ eor x7,x7,x2,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x23,x24 // a^b, b^c in next round ++ eor x16,x16,x27,ror#41 // Sigma1(e) ++ eor x8,x8,x23,ror#34 ++ add x22,x22,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x6,x6,x15,ror#61 ++ eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) ++ add x22,x22,x16 // h+=Sigma1(e) ++ eor x19,x19,x24 // Maj(a,b,c) ++ eor x17,x8,x23,ror#39 // Sigma0(a) ++ eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) ++ add x1,x1,x10 ++ add x26,x26,x22 // d+=h ++ add x22,x22,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x1,x1,x7 ++ add x22,x22,x17 // h+=Sigma0(a) ++ add x1,x1,x6 ++ ldr x6,[sp,#24] ++ str x9,[sp,#16] ++ ror x16,x26,#14 ++ add x21,x21,x19 // h+=K[i] ++ ror x8,x3,#1 ++ and x17,x27,x26 ++ ror x7,x0,#19 ++ bic x19,x20,x26 ++ ror x9,x22,#28 ++ add x21,x21,x1 // h+=X[i] ++ eor x16,x16,x26,ror#18 
++ eor x8,x8,x3,ror#8 ++ orr x17,x17,x19 // Ch(e,f,g) ++ eor x19,x22,x23 // a^b, b^c in next round ++ eor x16,x16,x26,ror#41 // Sigma1(e) ++ eor x9,x9,x22,ror#34 ++ add x21,x21,x17 // h+=Ch(e,f,g) ++ and x28,x28,x19 // (b^c)&=(a^b) ++ eor x7,x7,x0,ror#61 ++ eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) ++ add x21,x21,x16 // h+=Sigma1(e) ++ eor x28,x28,x23 // Maj(a,b,c) ++ eor x17,x9,x22,ror#39 // Sigma0(a) ++ eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) ++ add x2,x2,x11 ++ add x25,x25,x21 // d+=h ++ add x21,x21,x28 // h+=Maj(a,b,c) ++ ldr x28,[x30],#8 // *K++, x19 in next round ++ add x2,x2,x8 ++ add x21,x21,x17 // h+=Sigma0(a) ++ add x2,x2,x7 ++ ldr x7,[sp,#0] ++ str x10,[sp,#24] ++ ror x16,x25,#14 ++ add x20,x20,x28 // h+=K[i] ++ ror x9,x4,#1 ++ and x17,x26,x25 ++ ror x8,x1,#19 ++ bic x28,x27,x25 ++ ror x10,x21,#28 ++ add x20,x20,x2 // h+=X[i] ++ eor x16,x16,x25,ror#18 ++ eor x9,x9,x4,ror#8 ++ orr x17,x17,x28 // Ch(e,f,g) ++ eor x28,x21,x22 // a^b, b^c in next round ++ eor x16,x16,x25,ror#41 // Sigma1(e) ++ eor x10,x10,x21,ror#34 ++ add x20,x20,x17 // h+=Ch(e,f,g) ++ and x19,x19,x28 // (b^c)&=(a^b) ++ eor x8,x8,x1,ror#61 ++ eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) ++ add x20,x20,x16 // h+=Sigma1(e) ++ eor x19,x19,x22 // Maj(a,b,c) ++ eor x17,x10,x21,ror#39 // Sigma0(a) ++ eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) ++ add x3,x3,x12 ++ add x24,x24,x20 // d+=h ++ add x20,x20,x19 // h+=Maj(a,b,c) ++ ldr x19,[x30],#8 // *K++, x28 in next round ++ add x3,x3,x9 ++ add x20,x20,x17 // h+=Sigma0(a) ++ add x3,x3,x8 ++ cbnz x19,.Loop_16_xx ++ ++ ldp x0,x2,[x29,#96] ++ ldr x1,[x29,#112] ++ sub x30,x30,#648 // rewind ++ ++ ldp x3,x4,[x0] ++ ldp x5,x6,[x0,#2*8] ++ add x1,x1,#14*8 // advance input pointer ++ ldp x7,x8,[x0,#4*8] ++ add x20,x20,x3 ++ ldp x9,x10,[x0,#6*8] ++ add x21,x21,x4 ++ add x22,x22,x5 ++ add x23,x23,x6 ++ stp x20,x21,[x0] ++ add x24,x24,x7 ++ add x25,x25,x8 ++ stp x22,x23,[x0,#2*8] ++ add x26,x26,x9 ++ add x27,x27,x10 ++ cmp x1,x2 ++ stp x24,x25,[x0,#4*8] ++ stp x26,x27,[x0,#6*8] ++ b.ne .Loop ++ ++ ldp x19,x20,[x29,#16] ++ add sp,sp,#4*8 ++ ldp x21,x22,[x29,#32] ++ ldp x23,x24,[x29,#48] ++ ldp x25,x26,[x29,#64] ++ ldp x27,x28,[x29,#80] ++ ldp x29,x30,[sp],#128 ++ ret ++.size sha512_block_data_order,.-sha512_block_data_order ++ ++.align 6 ++.type .LK512,%object ++.LK512: ++ .quad 0x428a2f98d728ae22,0x7137449123ef65cd ++ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc ++ .quad 0x3956c25bf348b538,0x59f111f1b605d019 ++ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 ++ .quad 0xd807aa98a3030242,0x12835b0145706fbe ++ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 ++ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 ++ .quad 0x9bdc06a725c71235,0xc19bf174cf692694 ++ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 ++ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 ++ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 ++ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 ++ .quad 0x983e5152ee66dfab,0xa831c66d2db43210 ++ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 ++ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 ++ .quad 0x06ca6351e003826f,0x142929670a0e6e70 ++ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 ++ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df ++ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 ++ .quad 0x81c2c92e47edaee6,0x92722c851482353b ++ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 ++ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 ++ .quad 0xd192e819d6ef5218,0xd69906245565a910 ++ .quad 0xf40e35855771202a,0x106aa07032bbd1b8 ++ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 ++ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 ++ .quad 
0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb ++ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 ++ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 ++ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec ++ .quad 0x90befffa23631e28,0xa4506cebde82bde9 ++ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b ++ .quad 0xca273eceea26619c,0xd186b8c721c0c207 ++ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 ++ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 ++ .quad 0x113f9804bef90dae,0x1b710b35131c471b ++ .quad 0x28db77f523047d84,0x32caab7b40c72493 ++ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c ++ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a ++ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 ++ .quad 0 // terminator ++.size .LK512,.-.LK512 ++#ifndef __KERNEL__ ++.align 3 ++.LOPENSSL_armcap_P: ++# ifdef __ILP32__ ++ .long OPENSSL_armcap_P-. ++# else ++ .quad OPENSSL_armcap_P-. ++# endif ++#endif ++.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by " ++.align 2 ++#ifndef __KERNEL__ ++.comm OPENSSL_armcap_P,4,4 ++#endif +diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h +index 7193bf97b8da..e60375ce0dd2 100644 +--- a/arch/arm64/include/asm/assembler.h ++++ b/arch/arm64/include/asm/assembler.h +@@ -86,6 +86,24 @@ + dmb \opt + .endm + ++/* ++ * Value prediction barrier ++ */ ++ .macro csdb ++ hint #20 ++ .endm ++ ++/* ++ * Sanitise a 64-bit bounded index wrt speculation, returning zero if out ++ * of bounds. ++ */ ++ .macro mask_nospec64, idx, limit, tmp ++ sub \tmp, \idx, \limit ++ bic \tmp, \tmp, \idx ++ and \idx, \idx, \tmp, asr #63 ++ csdb ++ .endm ++ + /* + * NOP sequence + */ +@@ -416,4 +434,5 @@ alternative_endif + .macro pte_to_phys, phys, pte + and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) + .endm ++ + #endif /* __ASM_ASSEMBLER_H */ +diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h +index 0fe7e43b7fbc..0b0755c961ac 100644 +--- a/arch/arm64/include/asm/barrier.h ++++ b/arch/arm64/include/asm/barrier.h +@@ -31,6 +31,8 @@ + #define dmb(opt) asm volatile("dmb " #opt : : : "memory") + #define dsb(opt) asm volatile("dsb " #opt : : : "memory") + ++#define csdb() asm volatile("hint #20" : : : "memory") ++ + #define mb() dsb(sy) + #define rmb() dsb(ld) + #define wmb() dsb(st) +@@ -38,6 +40,27 @@ + #define dma_rmb() dmb(oshld) + #define dma_wmb() dmb(oshst) + ++/* ++ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz ++ * and 0 otherwise. 
++ */ ++#define array_index_mask_nospec array_index_mask_nospec ++static inline unsigned long array_index_mask_nospec(unsigned long idx, ++ unsigned long sz) ++{ ++ unsigned long mask; ++ ++ asm volatile( ++ " cmp %1, %2\n" ++ " sbc %0, xzr, xzr\n" ++ : "=r" (mask) ++ : "r" (idx), "Ir" (sz) ++ : "cc"); ++ ++ csdb(); ++ return mask; ++} ++ + #define __smp_mb() dmb(ish) + #define __smp_rmb() dmb(ishld) + #define __smp_wmb() dmb(ishst) +diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h +index 7ddf233f05bd..ce67bf6a0886 100644 +--- a/arch/arm64/include/asm/cpucaps.h ++++ b/arch/arm64/include/asm/cpucaps.h +@@ -35,7 +35,8 @@ + #define ARM64_HYP_OFFSET_LOW 14 + #define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 + #define ARM64_UNMAP_KERNEL_AT_EL0 16 ++#define ARM64_HARDEN_BRANCH_PREDICTOR 17 + +-#define ARM64_NCAPS 17 ++#define ARM64_NCAPS 18 + + #endif /* __ASM_CPUCAPS_H */ +diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h +index 1d47930c30dc..9ee3038a6b98 100644 +--- a/arch/arm64/include/asm/cputype.h ++++ b/arch/arm64/include/asm/cputype.h +@@ -75,7 +75,10 @@ + #define ARM_CPU_PART_AEM_V8 0xD0F + #define ARM_CPU_PART_FOUNDATION 0xD00 + #define ARM_CPU_PART_CORTEX_A57 0xD07 ++#define ARM_CPU_PART_CORTEX_A72 0xD08 + #define ARM_CPU_PART_CORTEX_A53 0xD03 ++#define ARM_CPU_PART_CORTEX_A73 0xD09 ++#define ARM_CPU_PART_CORTEX_A75 0xD0A + + #define APM_CPU_PART_POTENZA 0x000 + +@@ -87,6 +90,9 @@ + + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) + #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) ++#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ++#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) ++#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75) + #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) + #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) + #define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2) +diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h +index 20dcb196b240..4e5f36a804b4 100644 +--- a/arch/arm64/include/asm/futex.h ++++ b/arch/arm64/include/asm/futex.h +@@ -51,13 +51,14 @@ + : "memory") + + static inline int +-futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) ++futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *_uaddr) + { + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; + int oparg = (int)(encoded_op << 8) >> 20; + int cmparg = (int)(encoded_op << 20) >> 20; + int oldval = 0, ret, tmp; ++ u32 __user *uaddr = __uaccess_mask_ptr(_uaddr); + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) + oparg = 1U << (oparg & 0x1f); +@@ -109,15 +110,17 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) + } + + static inline int +-futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, ++futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, + u32 oldval, u32 newval) + { + int ret = 0; + u32 val, tmp; ++ u32 __user *uaddr; + +- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) ++ if (!access_ok(VERIFY_WRITE, _uaddr, sizeof(u32))) + return -EFAULT; + ++ uaddr = __uaccess_mask_ptr(_uaddr); + asm volatile("// futex_atomic_cmpxchg_inatomic\n" + ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + " prfm pstl1strm, %2\n" +diff --git 
a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h +index e5050388e062..37d56e85036e 100644 +--- a/arch/arm64/include/asm/kvm_host.h ++++ b/arch/arm64/include/asm/kvm_host.h +@@ -393,4 +393,9 @@ static inline void __cpu_init_stage2(void) + "PARange is %d bits, unsupported configuration!", parange); + } + ++static inline bool kvm_arm_harden_branch_predictor(void) ++{ ++ return cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR); ++} ++ + #endif /* __ARM64_KVM_HOST_H__ */ +diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h +index 6d22017ebbad..80bf33715ecb 100644 +--- a/arch/arm64/include/asm/kvm_mmu.h ++++ b/arch/arm64/include/asm/kvm_mmu.h +@@ -313,5 +313,43 @@ static inline unsigned int kvm_get_vmid_bits(void) + return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; + } + ++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR ++#include ++ ++static inline void *kvm_get_hyp_vector(void) ++{ ++ struct bp_hardening_data *data = arm64_get_bp_hardening_data(); ++ void *vect = kvm_ksym_ref(__kvm_hyp_vector); ++ ++ if (data->fn) { ++ vect = __bp_harden_hyp_vecs_start + ++ data->hyp_vectors_slot * SZ_2K; ++ ++ if (!cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) ++ vect = lm_alias(vect); ++ } ++ ++ return vect; ++} ++ ++static inline int kvm_map_vectors(void) ++{ ++ return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start), ++ kvm_ksym_ref(__bp_harden_hyp_vecs_end), ++ PAGE_HYP_EXEC); ++} ++ ++#else ++static inline void *kvm_get_hyp_vector(void) ++{ ++ return kvm_ksym_ref(__kvm_hyp_vector); ++} ++ ++static inline int kvm_map_vectors(void) ++{ ++ return 0; ++} ++#endif ++ + #endif /* __ASSEMBLY__ */ + #endif /* __ARM64_KVM_MMU_H__ */ +diff --git a/arch/arm64/include/asm/kvm_psci.h b/arch/arm64/include/asm/kvm_psci.h +deleted file mode 100644 +index bc39e557c56c..000000000000 +--- a/arch/arm64/include/asm/kvm_psci.h ++++ /dev/null +@@ -1,27 +0,0 @@ +-/* +- * Copyright (C) 2012,2013 - ARM Ltd +- * Author: Marc Zyngier +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program. If not, see . +- */ +- +-#ifndef __ARM64_KVM_PSCI_H__ +-#define __ARM64_KVM_PSCI_H__ +- +-#define KVM_ARM_PSCI_0_1 1 +-#define KVM_ARM_PSCI_0_2 2 +- +-int kvm_psci_version(struct kvm_vcpu *vcpu); +-int kvm_psci_call(struct kvm_vcpu *vcpu); +- +-#endif /* __ARM64_KVM_PSCI_H__ */ +diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h +index 5e3faba689e0..ba917be5565a 100644 +--- a/arch/arm64/include/asm/memory.h ++++ b/arch/arm64/include/asm/memory.h +@@ -60,8 +60,6 @@ + * KIMAGE_VADDR - the virtual address of the start of the kernel image + * VA_BITS - the maximum number of bits for virtual addresses. + * VA_START - the first kernel virtual address. +- * TASK_SIZE - the maximum size of a user space task. +- * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. 
+ */ + #define VA_BITS (CONFIG_ARM64_VA_BITS) + #define VA_START (UL(0xffffffffffffffff) - \ +@@ -76,19 +74,6 @@ + #define PCI_IO_END (VMEMMAP_START - SZ_2M) + #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) + #define FIXADDR_TOP (PCI_IO_START - SZ_2M) +-#define TASK_SIZE_64 (UL(1) << VA_BITS) +- +-#ifdef CONFIG_COMPAT +-#define TASK_SIZE_32 UL(0x100000000) +-#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ +- TASK_SIZE_32 : TASK_SIZE_64) +-#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ +- TASK_SIZE_32 : TASK_SIZE_64) +-#else +-#define TASK_SIZE TASK_SIZE_64 +-#endif /* CONFIG_COMPAT */ +- +-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) + + #define KERNEL_START _text + #define KERNEL_END _end +diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h +index a813edf28737..d51158a61892 100644 +--- a/arch/arm64/include/asm/mmu.h ++++ b/arch/arm64/include/asm/mmu.h +@@ -20,6 +20,8 @@ + + #ifndef __ASSEMBLY__ + ++#include ++ + typedef struct { + atomic64_t id; + void *vdso; +@@ -38,6 +40,43 @@ static inline bool arm64_kernel_unmapped_at_el0(void) + cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0); + } + ++typedef void (*bp_hardening_cb_t)(void); ++ ++struct bp_hardening_data { ++ int hyp_vectors_slot; ++ bp_hardening_cb_t fn; ++}; ++ ++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR ++extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[]; ++ ++DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); ++ ++static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) ++{ ++ return this_cpu_ptr(&bp_hardening_data); ++} ++ ++static inline void arm64_apply_bp_hardening(void) ++{ ++ struct bp_hardening_data *d; ++ ++ if (!cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) ++ return; ++ ++ d = arm64_get_bp_hardening_data(); ++ if (d->fn) ++ d->fn(); ++} ++#else ++static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) ++{ ++ return NULL; ++} ++ ++static inline void arm64_apply_bp_hardening(void) { } ++#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ ++ + extern void paging_init(void); + extern void bootmem_init(void); + extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); +diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h +index 60e34824e18c..5917147af0c4 100644 +--- a/arch/arm64/include/asm/processor.h ++++ b/arch/arm64/include/asm/processor.h +@@ -19,6 +19,13 @@ + #ifndef __ASM_PROCESSOR_H + #define __ASM_PROCESSOR_H + ++#define TASK_SIZE_64 (UL(1) << VA_BITS) ++ ++#define KERNEL_DS UL(-1) ++#define USER_DS (TASK_SIZE_64 - 1) ++ ++#ifndef __ASSEMBLY__ ++ + /* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). +@@ -37,6 +44,22 @@ + #include + #include + ++/* ++ * TASK_SIZE - the maximum size of a user space task. ++ * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. ++ */ ++#ifdef CONFIG_COMPAT ++#define TASK_SIZE_32 UL(0x100000000) ++#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ ++ TASK_SIZE_32 : TASK_SIZE_64) ++#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? 
\ ++ TASK_SIZE_32 : TASK_SIZE_64) ++#else ++#define TASK_SIZE TASK_SIZE_64 ++#endif /* CONFIG_COMPAT */ ++ ++#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) ++ + #define STACK_TOP_MAX TASK_SIZE_64 + #ifdef CONFIG_COMPAT + #define AARCH32_VECTORS_BASE 0xffff0000 +@@ -192,4 +215,5 @@ int cpu_enable_pan(void *__unused); + int cpu_enable_uao(void *__unused); + int cpu_enable_cache_maint_trap(void *__unused); + ++#endif /* __ASSEMBLY__ */ + #endif /* __ASM_PROCESSOR_H */ +diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h +index 7cb7f7cdcfbc..88bbe364b6ae 100644 +--- a/arch/arm64/include/asm/sysreg.h ++++ b/arch/arm64/include/asm/sysreg.h +@@ -118,6 +118,8 @@ + + /* id_aa64pfr0 */ + #define ID_AA64PFR0_CSV3_SHIFT 60 ++#define ID_AA64PFR0_CSV2_SHIFT 56 ++#define ID_AA64PFR0_SVE_SHIFT 32 + #define ID_AA64PFR0_GIC_SHIFT 24 + #define ID_AA64PFR0_ASIMD_SHIFT 20 + #define ID_AA64PFR0_FP_SHIFT 16 +diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h +index 811cf16a65f9..1d047d6c421b 100644 +--- a/arch/arm64/include/asm/uaccess.h ++++ b/arch/arm64/include/asm/uaccess.h +@@ -28,6 +28,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -59,16 +60,20 @@ struct exception_table_entry + + extern int fixup_exception(struct pt_regs *regs); + +-#define KERNEL_DS (-1UL) + #define get_ds() (KERNEL_DS) +- +-#define USER_DS TASK_SIZE_64 + #define get_fs() (current_thread_info()->addr_limit) + + static inline void set_fs(mm_segment_t fs) + { + current_thread_info()->addr_limit = fs; + ++ /* ++ * Prevent a mispredicted conditional call to set_fs from forwarding ++ * the wrong address limit to access_ok under speculation. ++ */ ++ dsb(nsh); ++ isb(); ++ + /* + * Enable/disable UAO so that copy_to_user() etc can access + * kernel memory with the unprivileged instructions. +@@ -87,22 +92,32 @@ static inline void set_fs(mm_segment_t fs) + * Returns 1 if the range is valid, 0 otherwise. + * + * This is equivalent to the following test: +- * (u65)addr + (u65)size <= current->addr_limit +- * +- * This needs 65-bit arithmetic. ++ * (u65)addr + (u65)size <= (u65)current->addr_limit + 1 + */ +-#define __range_ok(addr, size) \ +-({ \ +- unsigned long __addr = (unsigned long __force)(addr); \ +- unsigned long flag, roksum; \ +- __chk_user_ptr(addr); \ +- asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls" \ +- : "=&r" (flag), "=&r" (roksum) \ +- : "1" (__addr), "Ir" (size), \ +- "r" (current_thread_info()->addr_limit) \ +- : "cc"); \ +- flag; \ +-}) ++static inline unsigned long __range_ok(unsigned long addr, unsigned long size) ++{ ++ unsigned long limit = current_thread_info()->addr_limit; ++ ++ __chk_user_ptr(addr); ++ asm volatile( ++ // A + B <= C + 1 for all A,B,C, in four easy steps: ++ // 1: X = A + B; X' = X % 2^64 ++ " adds %0, %0, %2\n" ++ // 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4 ++ " csel %1, xzr, %1, hi\n" ++ // 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X' ++ // to compensate for the carry flag being set in step 4. For ++ // X > 2^64, X' merely has to remain nonzero, which it does. ++ " csinv %0, %0, xzr, cc\n" ++ // 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1 ++ // comes from the carry in being clear. Otherwise, we are ++ // testing X' - C == 0, subject to the previous adjustments. 
++ " sbcs xzr, %0, %1\n" ++ " cset %0, ls\n" ++ : "+r" (addr), "+r" (limit) : "Ir" (size) : "cc"); ++ ++ return addr; ++} + + /* + * When dealing with data aborts, watchpoints, or instruction traps we may end +@@ -111,7 +126,7 @@ static inline void set_fs(mm_segment_t fs) + */ + #define untagged_addr(addr) sign_extend64(addr, 55) + +-#define access_ok(type, addr, size) __range_ok(addr, size) ++#define access_ok(type, addr, size) __range_ok((unsigned long)(addr), size) + #define user_addr_max get_fs + + #define _ASM_EXTABLE(from, to) \ +@@ -120,6 +135,26 @@ static inline void set_fs(mm_segment_t fs) + " .long (" #from " - .), (" #to " - .)\n" \ + " .popsection\n" + ++/* ++ * Sanitise a uaccess pointer such that it becomes NULL if above the ++ * current addr_limit. ++ */ ++#define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr) ++static inline void __user *__uaccess_mask_ptr(const void __user *ptr) ++{ ++ void __user *safe_ptr; ++ ++ asm volatile( ++ " bics xzr, %1, %2\n" ++ " csel %0, %1, xzr, eq\n" ++ : "=&r" (safe_ptr) ++ : "r" (ptr), "r" (current_thread_info()->addr_limit) ++ : "cc"); ++ ++ csdb(); ++ return safe_ptr; ++} ++ + /* + * The "__xxx" versions of the user access functions do not verify the address + * space - it must have been done previously with a separate "access_ok()" +@@ -174,30 +209,35 @@ do { \ + CONFIG_ARM64_PAN)); \ + } while (0) + +-#define __get_user(x, ptr) \ ++#define __get_user_check(x, ptr, err) \ + ({ \ +- int __gu_err = 0; \ +- __get_user_err((x), (ptr), __gu_err); \ +- __gu_err; \ ++ __typeof__(*(ptr)) __user *__p = (ptr); \ ++ might_fault(); \ ++ if (access_ok(VERIFY_READ, __p, sizeof(*__p))) { \ ++ __p = uaccess_mask_ptr(__p); \ ++ __get_user_err((x), __p, (err)); \ ++ } else { \ ++ (x) = 0; (err) = -EFAULT; \ ++ } \ + }) + + #define __get_user_error(x, ptr, err) \ + ({ \ +- __get_user_err((x), (ptr), (err)); \ ++ __get_user_check((x), (ptr), (err)); \ + (void)0; \ + }) + +-#define __get_user_unaligned __get_user +- +-#define get_user(x, ptr) \ ++#define __get_user(x, ptr) \ + ({ \ +- __typeof__(*(ptr)) __user *__p = (ptr); \ +- might_fault(); \ +- access_ok(VERIFY_READ, __p, sizeof(*__p)) ? \ +- __get_user((x), __p) : \ +- ((x) = 0, -EFAULT); \ ++ int __gu_err = 0; \ ++ __get_user_check((x), (ptr), __gu_err); \ ++ __gu_err; \ + }) + ++#define __get_user_unaligned __get_user ++ ++#define get_user __get_user ++ + #define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ + asm volatile( \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ +@@ -242,47 +282,51 @@ do { \ + CONFIG_ARM64_PAN)); \ + } while (0) + +-#define __put_user(x, ptr) \ ++#define __put_user_check(x, ptr, err) \ + ({ \ +- int __pu_err = 0; \ +- __put_user_err((x), (ptr), __pu_err); \ +- __pu_err; \ ++ __typeof__(*(ptr)) __user *__p = (ptr); \ ++ might_fault(); \ ++ if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) { \ ++ __p = uaccess_mask_ptr(__p); \ ++ __put_user_err((x), __p, (err)); \ ++ } else { \ ++ (err) = -EFAULT; \ ++ } \ + }) + + #define __put_user_error(x, ptr, err) \ + ({ \ +- __put_user_err((x), (ptr), (err)); \ ++ __put_user_check((x), (ptr), (err)); \ + (void)0; \ + }) + +-#define __put_user_unaligned __put_user +- +-#define put_user(x, ptr) \ ++#define __put_user(x, ptr) \ + ({ \ +- __typeof__(*(ptr)) __user *__p = (ptr); \ +- might_fault(); \ +- access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ? 
\ +- __put_user((x), __p) : \ +- -EFAULT; \ ++ int __pu_err = 0; \ ++ __put_user_check((x), (ptr), __pu_err); \ ++ __pu_err; \ + }) + ++#define __put_user_unaligned __put_user ++ ++#define put_user __put_user ++ + extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); + extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); +-extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); +-extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); ++extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); + + static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) + { + kasan_check_write(to, n); + check_object_size(to, n, false); +- return __arch_copy_from_user(to, from, n); ++ return __arch_copy_from_user(to, __uaccess_mask_ptr(from), n); + } + + static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) + { + kasan_check_read(from, n); + check_object_size(from, n, true); +- return __arch_copy_to_user(to, from, n); ++ return __arch_copy_to_user(__uaccess_mask_ptr(to), from, n); + } + + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) +@@ -310,22 +354,25 @@ static inline unsigned long __must_check copy_to_user(void __user *to, const voi + return n; + } + +-static inline unsigned long __must_check copy_in_user(void __user *to, const void __user *from, unsigned long n) ++static inline unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n) + { + if (access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)) +- n = __copy_in_user(to, from, n); ++ n = __arch_copy_in_user(__uaccess_mask_ptr(to), __uaccess_mask_ptr(from), n); + return n; + } ++#define copy_in_user __copy_in_user + + #define __copy_to_user_inatomic __copy_to_user + #define __copy_from_user_inatomic __copy_from_user + +-static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) ++extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n); ++static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n) + { + if (access_ok(VERIFY_WRITE, to, n)) +- n = __clear_user(to, n); ++ n = __arch_clear_user(__uaccess_mask_ptr(to), n); + return n; + } ++#define clear_user __clear_user + + extern long strncpy_from_user(char *dest, const char __user *src, long count); + +diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile +index 7d66bbaafc0c..74b8fd860714 100644 +--- a/arch/arm64/kernel/Makefile ++++ b/arch/arm64/kernel/Makefile +@@ -51,6 +51,10 @@ arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o + arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ + cpu-reset.o + ++ifeq ($(CONFIG_KVM),y) ++arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o ++endif ++ + obj-y += $(arm64-obj-y) vdso/ probes/ + obj-m += $(arm64-obj-m) + head-y := head.o +diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c +index e9c4dc9e0ada..66be504edb6c 100644 +--- a/arch/arm64/kernel/arm64ksyms.c ++++ b/arch/arm64/kernel/arm64ksyms.c +@@ -37,8 +37,8 @@ EXPORT_SYMBOL(clear_page); + /* user mem (segment) */ + EXPORT_SYMBOL(__arch_copy_from_user); + EXPORT_SYMBOL(__arch_copy_to_user); 
+-EXPORT_SYMBOL(__clear_user); +-EXPORT_SYMBOL(__copy_in_user); ++EXPORT_SYMBOL(__arch_clear_user); ++EXPORT_SYMBOL(__arch_copy_in_user); + + /* physical memory */ + EXPORT_SYMBOL(memstart_addr); +diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S +new file mode 100644 +index 000000000000..dc4eb154e33b +--- /dev/null ++++ b/arch/arm64/kernel/bpi.S +@@ -0,0 +1,75 @@ ++/* ++ * Contains CPU specific branch predictor invalidation sequences ++ * ++ * Copyright (C) 2018 ARM Ltd. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . ++ */ ++ ++#include ++#include ++ ++.macro ventry target ++ .rept 31 ++ nop ++ .endr ++ b \target ++.endm ++ ++.macro vectors target ++ ventry \target + 0x000 ++ ventry \target + 0x080 ++ ventry \target + 0x100 ++ ventry \target + 0x180 ++ ++ ventry \target + 0x200 ++ ventry \target + 0x280 ++ ventry \target + 0x300 ++ ventry \target + 0x380 ++ ++ ventry \target + 0x400 ++ ventry \target + 0x480 ++ ventry \target + 0x500 ++ ventry \target + 0x580 ++ ++ ventry \target + 0x600 ++ ventry \target + 0x680 ++ ventry \target + 0x700 ++ ventry \target + 0x780 ++.endm ++ ++ .align 11 ++ENTRY(__bp_harden_hyp_vecs_start) ++ .rept 4 ++ vectors __kvm_hyp_vector ++ .endr ++ENTRY(__bp_harden_hyp_vecs_end) ++ ++.macro smccc_workaround_1 inst ++ sub sp, sp, #(8 * 4) ++ stp x2, x3, [sp, #(8 * 0)] ++ stp x0, x1, [sp, #(8 * 2)] ++ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 ++ \inst #0 ++ ldp x2, x3, [sp, #(8 * 0)] ++ ldp x0, x1, [sp, #(8 * 2)] ++ add sp, sp, #(8 * 4) ++.endm ++ ++ENTRY(__smccc_workaround_1_smc_start) ++ smccc_workaround_1 smc ++ENTRY(__smccc_workaround_1_smc_end) ++ ++ENTRY(__smccc_workaround_1_hvc_start) ++ smccc_workaround_1 hvc ++ENTRY(__smccc_workaround_1_hvc_end) +diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c +index b75e917aac46..74107134cc30 100644 +--- a/arch/arm64/kernel/cpu_errata.c ++++ b/arch/arm64/kernel/cpu_errata.c +@@ -46,6 +46,147 @@ static int cpu_enable_trap_ctr_access(void *__unused) + return 0; + } + ++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR ++#include ++#include ++ ++DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); ++ ++#ifdef CONFIG_KVM ++extern char __smccc_workaround_1_smc_start[]; ++extern char __smccc_workaround_1_smc_end[]; ++extern char __smccc_workaround_1_hvc_start[]; ++extern char __smccc_workaround_1_hvc_end[]; ++ ++static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, ++ const char *hyp_vecs_end) ++{ ++ void *dst = __bp_harden_hyp_vecs_start + slot * SZ_2K; ++ int i; ++ ++ for (i = 0; i < SZ_2K; i += 0x80) ++ memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); ++ ++ flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); ++} ++ ++static void __install_bp_hardening_cb(bp_hardening_cb_t fn, ++ const char *hyp_vecs_start, ++ const char *hyp_vecs_end) ++{ ++ static int last_slot = -1; ++ static DEFINE_SPINLOCK(bp_lock); ++ int cpu, slot = -1; ++ ++ spin_lock(&bp_lock); ++ for_each_possible_cpu(cpu) { ++ if (per_cpu(bp_hardening_data.fn, cpu) 
== fn) { ++ slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); ++ break; ++ } ++ } ++ ++ if (slot == -1) { ++ last_slot++; ++ BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start) ++ / SZ_2K) <= last_slot); ++ slot = last_slot; ++ __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); ++ } ++ ++ __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); ++ __this_cpu_write(bp_hardening_data.fn, fn); ++ spin_unlock(&bp_lock); ++} ++#else ++#define __smccc_workaround_1_smc_start NULL ++#define __smccc_workaround_1_smc_end NULL ++#define __smccc_workaround_1_hvc_start NULL ++#define __smccc_workaround_1_hvc_end NULL ++ ++static void __install_bp_hardening_cb(bp_hardening_cb_t fn, ++ const char *hyp_vecs_start, ++ const char *hyp_vecs_end) ++{ ++ __this_cpu_write(bp_hardening_data.fn, fn); ++} ++#endif /* CONFIG_KVM */ ++ ++static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, ++ bp_hardening_cb_t fn, ++ const char *hyp_vecs_start, ++ const char *hyp_vecs_end) ++{ ++ u64 pfr0; ++ ++ if (!entry->matches(entry, SCOPE_LOCAL_CPU)) ++ return; ++ ++ pfr0 = read_cpuid(ID_AA64PFR0_EL1); ++ if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) ++ return; ++ ++ __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); ++} ++ ++#include ++#include ++#include ++ ++static void call_smc_arch_workaround_1(void) ++{ ++ arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); ++} ++ ++static void call_hvc_arch_workaround_1(void) ++{ ++ arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); ++} ++ ++static int enable_smccc_arch_workaround_1(void *data) ++{ ++ const struct arm64_cpu_capabilities *entry = data; ++ bp_hardening_cb_t cb; ++ void *smccc_start, *smccc_end; ++ struct arm_smccc_res res; ++ ++ if (!entry->matches(entry, SCOPE_LOCAL_CPU)) ++ return 0; ++ ++ if (psci_ops.smccc_version == SMCCC_VERSION_1_0) ++ return 0; ++ ++ switch (psci_ops.conduit) { ++ case PSCI_CONDUIT_HVC: ++ arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ++ ARM_SMCCC_ARCH_WORKAROUND_1, &res); ++ if (res.a0) ++ return 0; ++ cb = call_hvc_arch_workaround_1; ++ smccc_start = __smccc_workaround_1_hvc_start; ++ smccc_end = __smccc_workaround_1_hvc_end; ++ break; ++ ++ case PSCI_CONDUIT_SMC: ++ arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ++ ARM_SMCCC_ARCH_WORKAROUND_1, &res); ++ if (res.a0) ++ return 0; ++ cb = call_smc_arch_workaround_1; ++ smccc_start = __smccc_workaround_1_smc_start; ++ smccc_end = __smccc_workaround_1_smc_end; ++ break; ++ ++ default: ++ return 0; ++ } ++ ++ install_bp_hardening_cb(entry, cb, smccc_start, smccc_end); ++ ++ return 0; ++} ++#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ ++ + #define MIDR_RANGE(model, min, max) \ + .def_scope = SCOPE_LOCAL_CPU, \ + .matches = is_affected_midr_range, \ +@@ -53,6 +194,13 @@ static int cpu_enable_trap_ctr_access(void *__unused) + .midr_range_min = min, \ + .midr_range_max = max + ++#define MIDR_ALL_VERSIONS(model) \ ++ .def_scope = SCOPE_LOCAL_CPU, \ ++ .matches = is_affected_midr_range, \ ++ .midr_model = model, \ ++ .midr_range_min = 0, \ ++ .midr_range_max = (MIDR_VARIANT_MASK | MIDR_REVISION_MASK) ++ + const struct arm64_cpu_capabilities arm64_errata[] = { + #if defined(CONFIG_ARM64_ERRATUM_826319) || \ + defined(CONFIG_ARM64_ERRATUM_827319) || \ +@@ -130,6 +278,38 @@ const struct arm64_cpu_capabilities arm64_errata[] = { + .def_scope = SCOPE_LOCAL_CPU, + .enable = cpu_enable_trap_ctr_access, + }, ++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR ++ { ++ .capability = ARM64_HARDEN_BRANCH_PREDICTOR, ++ 
MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), ++ .enable = enable_smccc_arch_workaround_1, ++ }, ++ { ++ .capability = ARM64_HARDEN_BRANCH_PREDICTOR, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), ++ .enable = enable_smccc_arch_workaround_1, ++ }, ++ { ++ .capability = ARM64_HARDEN_BRANCH_PREDICTOR, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), ++ .enable = enable_smccc_arch_workaround_1, ++ }, ++ { ++ .capability = ARM64_HARDEN_BRANCH_PREDICTOR, ++ MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), ++ .enable = enable_smccc_arch_workaround_1, ++ }, ++ { ++ .capability = ARM64_HARDEN_BRANCH_PREDICTOR, ++ MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), ++ .enable = enable_smccc_arch_workaround_1, ++ }, ++ { ++ .capability = ARM64_HARDEN_BRANCH_PREDICTOR, ++ MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), ++ .enable = enable_smccc_arch_workaround_1, ++ }, ++#endif + { + } + }; +@@ -143,15 +323,18 @@ void verify_local_cpu_errata_workarounds(void) + { + const struct arm64_cpu_capabilities *caps = arm64_errata; + +- for (; caps->matches; caps++) +- if (!cpus_have_cap(caps->capability) && +- caps->matches(caps, SCOPE_LOCAL_CPU)) { ++ for (; caps->matches; caps++) { ++ if (cpus_have_cap(caps->capability)) { ++ if (caps->enable) ++ caps->enable((void *)caps); ++ } else if (caps->matches(caps, SCOPE_LOCAL_CPU)) { + pr_crit("CPU%d: Requires work around for %s, not detected" + " at boot time\n", + smp_processor_id(), + caps->desc ? : "an erratum"); + cpu_die_early(); + } ++ } + } + + void update_cpu_errata_workarounds(void) +diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c +index 5056fc597ae9..a0ee01202503 100644 +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -94,7 +94,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { + + static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { + ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), +- ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 28, 0), ++ ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), ++ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 24, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_GIC_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), +@@ -1024,9 +1025,8 @@ static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, + if (WARN_ON(preemptible())) + return false; + +- for (caps = cap_array; caps->desc; caps++) ++ for (caps = cap_array; caps->matches; caps++) + if (caps->capability == cap && +- caps->matches && + caps->matches(caps, SCOPE_LOCAL_CPU)) + return true; + return false; +@@ -1059,7 +1059,7 @@ void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) + * uses an IPI, giving us a PSTATE that disappears when + * we return. 
+ */ +- stop_machine(caps->enable, NULL, cpu_online_mask); ++ stop_machine(caps->enable, (void *)caps, cpu_online_mask); + } + + /* +@@ -1116,7 +1116,7 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps_list) + cpu_die_early(); + } + if (caps->enable) +- caps->enable(NULL); ++ caps->enable((void *)caps); + } + } + +diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S +index 8d1600b18562..b79e302d2a3e 100644 +--- a/arch/arm64/kernel/entry.S ++++ b/arch/arm64/kernel/entry.S +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -125,10 +126,10 @@ alternative_else_nop_endif + .else + add x21, sp, #S_FRAME_SIZE + get_thread_info tsk +- /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ ++ /* Save the task's original addr_limit and set USER_DS */ + ldr x20, [tsk, #TI_ADDR_LIMIT] + str x20, [sp, #S_ORIG_ADDR_LIMIT] +- mov x20, #TASK_SIZE_64 ++ mov x20, #USER_DS + str x20, [tsk, #TI_ADDR_LIMIT] + /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ + .endif /* \el == 0 */ +@@ -588,13 +589,15 @@ el0_ia: + * Instruction abort handling + */ + mrs x26, far_el1 +- // enable interrupts before calling the main handler +- enable_dbg_and_irq ++ msr daifclr, #(8 | 4 | 1) ++#ifdef CONFIG_TRACE_IRQFLAGS ++ bl trace_hardirqs_off ++#endif + ct_user_exit + mov x0, x26 + mov x1, x25 + mov x2, sp +- bl do_mem_abort ++ bl do_el0_ia_bp_hardening + b ret_to_user + el0_fpsimd_acc: + /* +@@ -621,8 +624,10 @@ el0_sp_pc: + * Stack or PC alignment exception handling + */ + mrs x26, far_el1 +- // enable interrupts before calling the main handler +- enable_dbg_and_irq ++ enable_dbg ++#ifdef CONFIG_TRACE_IRQFLAGS ++ bl trace_hardirqs_off ++#endif + ct_user_exit + mov x0, x26 + mov x1, x25 +@@ -681,6 +686,11 @@ el0_irq_naked: + #endif + + ct_user_exit ++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR ++ tbz x22, #55, 1f ++ bl do_el0_irq_bp_hardening ++1: ++#endif + irq_handler + + #ifdef CONFIG_TRACE_IRQFLAGS +@@ -794,6 +804,7 @@ el0_svc_naked: // compat entry point + b.ne __sys_trace + cmp scno, sc_nr // check upper syscall limit + b.hs ni_sys ++ mask_nospec64 scno, sc_nr, x19 // enforce bounds for syscall number + ldr x16, [stbl, scno, lsl #3] // address in the syscall table + blr x16 // call sys_* routine + b ret_fast_syscall +diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c +index 2e6e9e99977b..efe43c5f2dc1 100644 +--- a/arch/arm64/kvm/handle_exit.c ++++ b/arch/arm64/kvm/handle_exit.c +@@ -22,12 +22,15 @@ + #include + #include + ++#include ++ + #include + #include + #include + #include + #include +-#include ++#include ++#include + + #define CREATE_TRACE_POINTS + #include "trace.h" +@@ -42,7 +45,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) + kvm_vcpu_hvc_get_imm(vcpu)); + vcpu->stat.hvc_exit_stat++; + +- ret = kvm_psci_call(vcpu); ++ ret = kvm_hvc_call_handler(vcpu); + if (ret < 0) { + vcpu_set_reg(vcpu, 0, ~0UL); + return 1; +@@ -53,7 +56,16 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) + + static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) + { ++ /* ++ * "If an SMC instruction executed at Non-secure EL1 is ++ * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a ++ * Trap exception, not a Secure Monitor Call exception [...]" ++ * ++ * We need to advance the PC after the trap, as it would ++ * otherwise return to the same address... 
++ */ + vcpu_set_reg(vcpu, 0, ~0UL); ++ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 1; + } + +diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S +index 4e92399f7105..4e9d50c3e658 100644 +--- a/arch/arm64/kvm/hyp/hyp-entry.S ++++ b/arch/arm64/kvm/hyp/hyp-entry.S +@@ -15,6 +15,7 @@ + * along with this program. If not, see . + */ + ++#include + #include + + #include +@@ -79,10 +80,11 @@ alternative_endif + lsr x0, x1, #ESR_ELx_EC_SHIFT + + cmp x0, #ESR_ELx_EC_HVC64 ++ ccmp x0, #ESR_ELx_EC_HVC32, #4, ne + b.ne el1_trap + +- mrs x1, vttbr_el2 // If vttbr is valid, the 64bit guest +- cbnz x1, el1_trap // called HVC ++ mrs x1, vttbr_el2 // If vttbr is valid, the guest ++ cbnz x1, el1_hvc_guest // called HVC + + /* Here, we're pretty sure the host called HVC. */ + ldp x0, x1, [sp], #16 +@@ -101,6 +103,20 @@ alternative_endif + + 2: eret + ++el1_hvc_guest: ++ /* ++ * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1. ++ * The workaround has already been applied on the host, ++ * so let's quickly get back to the guest. We don't bother ++ * restoring x1, as it can be clobbered anyway. ++ */ ++ ldr x1, [sp] // Guest's x0 ++ eor w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1 ++ cbnz w1, el1_trap ++ mov x0, x1 ++ add sp, sp, #16 ++ eret ++ + el1_trap: + /* + * x0: ESR_EC +diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c +index 9174ba917d65..c49d09387192 100644 +--- a/arch/arm64/kvm/hyp/switch.c ++++ b/arch/arm64/kvm/hyp/switch.c +@@ -17,6 +17,9 @@ + + #include + #include ++#include ++ ++#include + + #include + #include +@@ -50,7 +53,7 @@ static void __hyp_text __activate_traps_vhe(void) + val &= ~CPACR_EL1_FPEN; + write_sysreg(val, cpacr_el1); + +- write_sysreg(__kvm_hyp_vector, vbar_el1); ++ write_sysreg(kvm_get_hyp_vector(), vbar_el1); + } + + static void __hyp_text __activate_traps_nvhe(void) +diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S +index 5d1cad3ce6d6..efbf610eaf4e 100644 +--- a/arch/arm64/lib/clear_user.S ++++ b/arch/arm64/lib/clear_user.S +@@ -24,7 +24,7 @@ + + .text + +-/* Prototype: int __clear_user(void *addr, size_t sz) ++/* Prototype: int __arch_clear_user(void *addr, size_t sz) + * Purpose : clear some user memory + * Params : addr - user memory address to clear + * : sz - number of bytes to clear +@@ -32,7 +32,7 @@ + * + * Alignment fixed up by hardware. 
+ */ +-ENTRY(__clear_user) ++ENTRY(__arch_clear_user) + ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) + mov x2, x1 // save the size for fixup return +@@ -57,7 +57,7 @@ uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 + ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) + ret +-ENDPROC(__clear_user) ++ENDPROC(__arch_clear_user) + + .section .fixup,"ax" + .align 2 +diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S +index f7292dd08c84..841bf8f7fab7 100644 +--- a/arch/arm64/lib/copy_in_user.S ++++ b/arch/arm64/lib/copy_in_user.S +@@ -67,7 +67,7 @@ + .endm + + end .req x5 +-ENTRY(__copy_in_user) ++ENTRY(__arch_copy_in_user) + ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) + add end, x0, x2 +@@ -76,7 +76,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) + mov x0, #0 + ret +-ENDPROC(__copy_in_user) ++ENDPROC(__arch_copy_in_user) + + .section .fixup,"ax" + .align 2 +diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c +index f00f5eeb556f..62d976e843fc 100644 +--- a/arch/arm64/mm/context.c ++++ b/arch/arm64/mm/context.c +@@ -230,9 +230,21 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) + raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); + + switch_mm_fastpath: ++ ++ arm64_apply_bp_hardening(); ++ + cpu_switch_mm(mm->pgd, mm); + } + ++/* Errata workaround post TTBRx_EL1 update. */ ++asmlinkage void post_ttbr_update_workaround(void) ++{ ++ asm(ALTERNATIVE("nop; nop; nop", ++ "ic iallu; dsb nsh; isb", ++ ARM64_WORKAROUND_CAVIUM_27456, ++ CONFIG_CAVIUM_ERRATUM_27456)); ++} ++ + static int asids_init(void) + { + asid_bits = get_cpu_asid_bits(); +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index 403fe9e57135..ad49ae8f3967 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -332,7 +332,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, + mm_flags |= FAULT_FLAG_WRITE; + } + +- if (is_permission_fault(esr) && (addr < USER_DS)) { ++ if (is_permission_fault(esr) && (addr < TASK_SIZE)) { + /* regs->orig_addr_limit may be 0 if we entered from EL0 */ + if (regs->orig_addr_limit == KERNEL_DS) + die("Accessing user space memory with fs=KERNEL_DS", regs, esr); +@@ -590,6 +590,29 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, + arm64_notify_die("", regs, &info, esr); + } + ++asmlinkage void __exception do_el0_irq_bp_hardening(void) ++{ ++ /* PC has already been checked in entry.S */ ++ arm64_apply_bp_hardening(); ++} ++ ++asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr, ++ unsigned int esr, ++ struct pt_regs *regs) ++{ ++ /* ++ * We've taken an instruction abort from userspace and not yet ++ * re-enabled IRQs. If the address is a kernel address, apply ++ * BP hardening prior to enabling IRQs and pre-emption. ++ */ ++ if (addr > TASK_SIZE) ++ arm64_apply_bp_hardening(); ++ ++ local_irq_enable(); ++ do_mem_abort(addr, esr, regs); ++} ++ ++ + /* + * Handle stack alignment exceptions. 
+ */ +@@ -600,6 +623,12 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr, + struct siginfo info; + struct task_struct *tsk = current; + ++ if (user_mode(regs)) { ++ if (instruction_pointer(regs) > TASK_SIZE) ++ arm64_apply_bp_hardening(); ++ local_irq_enable(); ++ } ++ + if (show_unhandled_signals && unhandled_signal(tsk, SIGBUS)) + pr_info_ratelimited("%s[%d]: %s exception: pc=%p sp=%p\n", + tsk->comm, task_pid_nr(tsk), +@@ -659,6 +688,9 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, + if (interrupts_enabled(regs)) + trace_hardirqs_off(); + ++ if (user_mode(regs) && instruction_pointer(regs) > TASK_SIZE) ++ arm64_apply_bp_hardening(); ++ + if (!inf->fn(addr, esr, regs)) { + rv = 1; + } else { +diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S +index c07d9cc057e6..619da1cbd32b 100644 +--- a/arch/arm64/mm/proc.S ++++ b/arch/arm64/mm/proc.S +@@ -139,12 +139,7 @@ ENTRY(cpu_do_switch_mm) + isb + msr ttbr0_el1, x0 // now update TTBR0 + isb +-alternative_if ARM64_WORKAROUND_CAVIUM_27456 +- ic iallu +- dsb nsh +- isb +-alternative_else_nop_endif +- ret ++ b post_ttbr_update_workaround // Back to C code... + ENDPROC(cpu_do_switch_mm) + + .pushsection ".idmap.text", "awx" +diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c +index 700e2d2da096..2e68ca1fe0db 100644 +--- a/arch/parisc/kernel/drivers.c ++++ b/arch/parisc/kernel/drivers.c +@@ -648,6 +648,10 @@ static int match_pci_device(struct device *dev, int index, + (modpath->mod == PCI_FUNC(devfn))); + } + ++ /* index might be out of bounds for bc[] */ ++ if (index >= 6) ++ return 0; ++ + id = PCI_SLOT(pdev->devfn) | (PCI_FUNC(pdev->devfn) << 5); + return (modpath->bc[index] == id); + } +diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c +index 295bfb7124bc..39127b691b78 100644 +--- a/arch/s390/kernel/ipl.c ++++ b/arch/s390/kernel/ipl.c +@@ -798,6 +798,7 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, + /* copy and convert to ebcdic */ + memcpy(ipb->hdr.loadparm, buf, lp_len); + ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN); ++ ipb->hdr.flags |= DIAG308_FLAGS_LP_VALID; + return len; + } + +diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c +index b1815b20a99c..37032545c58e 100644 +--- a/drivers/acpi/nfit/core.c ++++ b/drivers/acpi/nfit/core.c +@@ -2547,15 +2547,21 @@ static void acpi_nfit_scrub(struct work_struct *work) + static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) + { + struct nfit_spa *nfit_spa; +- int rc; + +- list_for_each_entry(nfit_spa, &acpi_desc->spas, list) +- if (nfit_spa_type(nfit_spa->spa) == NFIT_SPA_DCR) { +- /* BLK regions don't need to wait for ars results */ +- rc = acpi_nfit_register_region(acpi_desc, nfit_spa); +- if (rc) +- return rc; +- } ++ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { ++ int rc, type = nfit_spa_type(nfit_spa->spa); ++ ++ /* PMEM and VMEM will be registered by the ARS workqueue */ ++ if (type == NFIT_SPA_PM || type == NFIT_SPA_VOLATILE) ++ continue; ++ /* BLK apertures belong to BLK region registration below */ ++ if (type == NFIT_SPA_BDW) ++ continue; ++ /* BLK regions don't need to wait for ARS results */ ++ rc = acpi_nfit_register_region(acpi_desc, nfit_spa); ++ if (rc) ++ return rc; ++ } + + queue_work(nfit_wq, &acpi_desc->work); + return 0; +diff --git a/drivers/block/loop.c b/drivers/block/loop.c +index dc318b9100c2..ff1c4d7aa025 100644 +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -1110,11 +1110,15 @@ loop_set_status(struct loop_device 
*lo, const struct loop_info64 *info) + if (info->lo_encrypt_type) { + unsigned int type = info->lo_encrypt_type; + +- if (type >= MAX_LO_CRYPT) +- return -EINVAL; ++ if (type >= MAX_LO_CRYPT) { ++ err = -EINVAL; ++ goto exit; ++ } + xfer = xfer_funcs[type]; +- if (xfer == NULL) +- return -EINVAL; ++ if (xfer == NULL) { ++ err = -EINVAL; ++ goto exit; ++ } + } else + xfer = NULL; + +diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c +index 8263429e21b8..79a48c37fb35 100644 +--- a/drivers/firmware/psci.c ++++ b/drivers/firmware/psci.c +@@ -59,7 +59,10 @@ bool psci_tos_resident_on(int cpu) + return cpu == resident_cpu; + } + +-struct psci_operations psci_ops; ++struct psci_operations psci_ops = { ++ .conduit = PSCI_CONDUIT_NONE, ++ .smccc_version = SMCCC_VERSION_1_0, ++}; + + typedef unsigned long (psci_fn)(unsigned long, unsigned long, + unsigned long, unsigned long); +@@ -210,6 +213,22 @@ static unsigned long psci_migrate_info_up_cpu(void) + 0, 0, 0); + } + ++static void set_conduit(enum psci_conduit conduit) ++{ ++ switch (conduit) { ++ case PSCI_CONDUIT_HVC: ++ invoke_psci_fn = __invoke_psci_fn_hvc; ++ break; ++ case PSCI_CONDUIT_SMC: ++ invoke_psci_fn = __invoke_psci_fn_smc; ++ break; ++ default: ++ WARN(1, "Unexpected PSCI conduit %d\n", conduit); ++ } ++ ++ psci_ops.conduit = conduit; ++} ++ + static int get_set_conduit_method(struct device_node *np) + { + const char *method; +@@ -222,9 +241,9 @@ static int get_set_conduit_method(struct device_node *np) + } + + if (!strcmp("hvc", method)) { +- invoke_psci_fn = __invoke_psci_fn_hvc; ++ set_conduit(PSCI_CONDUIT_HVC); + } else if (!strcmp("smc", method)) { +- invoke_psci_fn = __invoke_psci_fn_smc; ++ set_conduit(PSCI_CONDUIT_SMC); + } else { + pr_warn("invalid \"method\" property: %s\n", method); + return -EINVAL; +@@ -493,9 +512,36 @@ static void __init psci_init_migrate(void) + pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid); + } + ++static void __init psci_init_smccc(void) ++{ ++ u32 ver = ARM_SMCCC_VERSION_1_0; ++ int feature; ++ ++ feature = psci_features(ARM_SMCCC_VERSION_FUNC_ID); ++ ++ if (feature != PSCI_RET_NOT_SUPPORTED) { ++ u32 ret; ++ ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0); ++ if (ret == ARM_SMCCC_VERSION_1_1) { ++ psci_ops.smccc_version = SMCCC_VERSION_1_1; ++ ver = ret; ++ } ++ } ++ ++ /* ++ * Conveniently, the SMCCC and PSCI versions are encoded the ++ * same way. No, this isn't accidental. 
++ */ ++ pr_info("SMC Calling Convention v%d.%d\n", ++ PSCI_VERSION_MAJOR(ver), PSCI_VERSION_MINOR(ver)); ++ ++} ++ + static void __init psci_0_2_set_functions(void) + { + pr_info("Using standard PSCI v0.2 function IDs\n"); ++ psci_ops.get_version = psci_get_version; ++ + psci_function_id[PSCI_FN_CPU_SUSPEND] = + PSCI_FN_NATIVE(0_2, CPU_SUSPEND); + psci_ops.cpu_suspend = psci_cpu_suspend; +@@ -539,6 +585,7 @@ static int __init psci_probe(void) + psci_init_migrate(); + + if (PSCI_VERSION_MAJOR(ver) >= 1) { ++ psci_init_smccc(); + psci_init_cpu_suspend(); + psci_init_system_suspend(); + } +@@ -652,9 +699,9 @@ int __init psci_acpi_init(void) + pr_info("probing for conduit method from ACPI.\n"); + + if (acpi_psci_use_hvc()) +- invoke_psci_fn = __invoke_psci_fn_hvc; ++ set_conduit(PSCI_CONDUIT_HVC); + else +- invoke_psci_fn = __invoke_psci_fn_smc; ++ set_conduit(PSCI_CONDUIT_SMC); + + return psci_probe(); + } +diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c +index 41b72ce6613f..83e1345db9e2 100644 +--- a/drivers/gpu/drm/radeon/radeon_object.c ++++ b/drivers/gpu/drm/radeon/radeon_object.c +@@ -238,9 +238,10 @@ int radeon_bo_create(struct radeon_device *rdev, + * may be slow + * See https://bugs.freedesktop.org/show_bug.cgi?id=88758 + */ +- ++#ifndef CONFIG_COMPILE_TEST + #warning Please enable CONFIG_MTRR and CONFIG_X86_PAT for better performance \ + thanks to write-combining ++#endif + + if (bo->flags & RADEON_GEM_GTT_WC) + DRM_INFO_ONCE("Please enable CONFIG_MTRR and CONFIG_X86_PAT for " +diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c +index d8bc4b910192..9360cdce740e 100644 +--- a/drivers/hv/channel_mgmt.c ++++ b/drivers/hv/channel_mgmt.c +@@ -70,7 +70,7 @@ static const struct vmbus_device vmbus_devs[] = { + /* PCIE */ + { .dev_type = HV_PCIE, + HV_PCIE_GUID, +- .perf_device = true, ++ .perf_device = false, + }, + + /* Synthetic Frame Buffer */ +diff --git a/drivers/hwmon/ina2xx.c b/drivers/hwmon/ina2xx.c +index a629f7c130f0..ac63e562071f 100644 +--- a/drivers/hwmon/ina2xx.c ++++ b/drivers/hwmon/ina2xx.c +@@ -447,6 +447,7 @@ static int ina2xx_probe(struct i2c_client *client, + + /* set the device type */ + data->config = &ina2xx_config[id->driver_data]; ++ mutex_init(&data->config_lock); + + if (of_property_read_u32(dev->of_node, "shunt-resistor", &val) < 0) { + struct ina2xx_platform_data *pdata = dev_get_platdata(dev); +@@ -473,8 +474,6 @@ static int ina2xx_probe(struct i2c_client *client, + return -ENODEV; + } + +- mutex_init(&data->config_lock); +- + data->groups[group++] = &ina2xx_group; + if (id->driver_data == ina226) + data->groups[group++] = &ina226_group; +diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +index 48a39222fdf9..a9fc64557c53 100644 +--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c ++++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +@@ -101,7 +101,7 @@ static int get_v4l2_window32(struct v4l2_window __user *kp, + static int put_v4l2_window32(struct v4l2_window __user *kp, + struct v4l2_window32 __user *up) + { +- struct v4l2_clip __user *kclips = kp->clips; ++ struct v4l2_clip __user *kclips; + struct v4l2_clip32 __user *uclips; + compat_caddr_t p; + u32 clipcount; +@@ -116,6 +116,8 @@ static int put_v4l2_window32(struct v4l2_window __user *kp, + if (!clipcount) + return 0; + ++ if (get_user(kclips, &kp->clips)) ++ return -EFAULT; + if (get_user(p, &up->clips)) + return -EFAULT; + uclips = compat_ptr(p); +diff --git a/drivers/net/phy/micrel.c 
b/drivers/net/phy/micrel.c +index 4da73e2c37cf..2032a6de026b 100644 +--- a/drivers/net/phy/micrel.c ++++ b/drivers/net/phy/micrel.c +@@ -268,12 +268,23 @@ static int kszphy_nand_tree_disable(struct phy_device *phydev) + return ret; + } + +-/* Some config bits need to be set again on resume, handle them here. */ +-static int kszphy_config_reset(struct phy_device *phydev) ++static int kszphy_config_init(struct phy_device *phydev) + { + struct kszphy_priv *priv = phydev->priv; ++ const struct kszphy_type *type; + int ret; + ++ if (!priv) ++ return 0; ++ ++ type = priv->type; ++ ++ if (type->has_broadcast_disable) ++ kszphy_broadcast_disable(phydev); ++ ++ if (type->has_nand_tree_disable) ++ kszphy_nand_tree_disable(phydev); ++ + if (priv->rmii_ref_clk_sel) { + ret = kszphy_rmii_clk_sel(phydev, priv->rmii_ref_clk_sel_val); + if (ret) { +@@ -284,7 +295,7 @@ static int kszphy_config_reset(struct phy_device *phydev) + } + + if (priv->led_mode >= 0) +- kszphy_setup_led(phydev, priv->type->led_mode_reg, priv->led_mode); ++ kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode); + + if (phy_interrupt_is_valid(phydev)) { + int ctl = phy_read(phydev, MII_BMCR); +@@ -300,25 +311,6 @@ static int kszphy_config_reset(struct phy_device *phydev) + return 0; + } + +-static int kszphy_config_init(struct phy_device *phydev) +-{ +- struct kszphy_priv *priv = phydev->priv; +- const struct kszphy_type *type; +- +- if (!priv) +- return 0; +- +- type = priv->type; +- +- if (type->has_broadcast_disable) +- kszphy_broadcast_disable(phydev); +- +- if (type->has_nand_tree_disable) +- kszphy_nand_tree_disable(phydev); +- +- return kszphy_config_reset(phydev); +-} +- + static int ksz8041_config_init(struct phy_device *phydev) + { + struct device_node *of_node = phydev->mdio.dev.of_node; +@@ -723,14 +715,8 @@ static int kszphy_suspend(struct phy_device *phydev) + + static int kszphy_resume(struct phy_device *phydev) + { +- int ret; +- + genphy_resume(phydev); + +- ret = kszphy_config_reset(phydev); +- if (ret) +- return ret; +- + /* Enable PHY Interrupts */ + if (phy_interrupt_is_valid(phydev)) { + phydev->interrupts = PHY_INTERRUPT_ENABLED; +diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c +index 27ed25252aac..cfd81eb1b532 100644 +--- a/drivers/net/slip/slhc.c ++++ b/drivers/net/slip/slhc.c +@@ -509,6 +509,10 @@ slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) + if(x < 0 || x > comp->rslot_limit) + goto bad; + ++ /* Check if the cstate is initialized */ ++ if (!comp->rstate[x].initialized) ++ goto bad; ++ + comp->flags &=~ SLF_TOSS; + comp->recv_current = x; + } else { +@@ -673,6 +677,7 @@ slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) + if (cs->cs_tcp.doff > 5) + memcpy(cs->cs_tcpopt, icp + ihl*4 + sizeof(struct tcphdr), (cs->cs_tcp.doff - 5) * 4); + cs->cs_hsize = ihl*2 + cs->cs_tcp.doff*2; ++ cs->initialized = true; + /* Put headers back on packet + * Neither header checksum is recalculated + */ +diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c +index 1fca0024f294..4fb468666b19 100644 +--- a/drivers/net/usb/cdc_ether.c ++++ b/drivers/net/usb/cdc_ether.c +@@ -773,6 +773,12 @@ static const struct usb_device_id products[] = { + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, ++}, { ++ /* Cinterion AHS3 modem by GEMALTO */ ++ USB_DEVICE_AND_INTERFACE_INFO(0x1e2d, 0x0055, USB_CLASS_COMM, ++ USB_CDC_SUBCLASS_ETHERNET, ++ USB_CDC_PROTO_NONE), ++ .driver_info = (unsigned long)&wwan_info, + }, { 
+ /* Telit modules */ + USB_VENDOR_AND_INTERFACE_INFO(0x1bc7, USB_CLASS_COMM, +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index c53385a0052f..f5a96678494b 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -873,7 +873,8 @@ static int lan78xx_read_otp(struct lan78xx_net *dev, u32 offset, + offset += 0x100; + else + ret = -EINVAL; +- ret = lan78xx_read_raw_otp(dev, offset, length, data); ++ if (!ret) ++ ret = lan78xx_read_raw_otp(dev, offset, length, data); + } + + return ret; +diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c +index 231f84db9ab0..6113624ccec3 100644 +--- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c ++++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c +@@ -1454,6 +1454,7 @@ static int rtl8187_probe(struct usb_interface *intf, + goto err_free_dev; + } + mutex_init(&priv->io_mutex); ++ mutex_init(&priv->conf_mutex); + + SET_IEEE80211_DEV(dev, &intf->dev); + usb_set_intfdata(intf, dev); +@@ -1627,7 +1628,6 @@ static int rtl8187_probe(struct usb_interface *intf, + printk(KERN_ERR "rtl8187: Cannot register device\n"); + goto err_free_dmabuf; + } +- mutex_init(&priv->conf_mutex); + skb_queue_head_init(&priv->b_tx_status.queue); + + wiphy_info(dev->wiphy, "hwaddr %pM, %s V%d + %s, rfkill mask %d\n", +diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c +index 71bf9bded485..66e9bb053629 100644 +--- a/drivers/s390/cio/qdio_main.c ++++ b/drivers/s390/cio/qdio_main.c +@@ -126,7 +126,7 @@ static inline int qdio_check_ccq(struct qdio_q *q, unsigned int ccq) + static int qdio_do_eqbs(struct qdio_q *q, unsigned char *state, + int start, int count, int auto_ack) + { +- int rc, tmp_count = count, tmp_start = start, nr = q->nr, retried = 0; ++ int rc, tmp_count = count, tmp_start = start, nr = q->nr; + unsigned int ccq = 0; + + qperf_inc(q, eqbs); +@@ -149,14 +149,7 @@ static int qdio_do_eqbs(struct qdio_q *q, unsigned char *state, + qperf_inc(q, eqbs_partial); + DBF_DEV_EVENT(DBF_WARN, q->irq_ptr, "EQBS part:%02x", + tmp_count); +- /* +- * Retry once, if that fails bail out and process the +- * extracted buffers before trying again. +- */ +- if (!retried++) +- goto again; +- else +- return count - tmp_count; ++ return count - tmp_count; + } + + DBF_ERROR("%4x EQBS ERROR", SCH_NO(q)); +@@ -212,7 +205,10 @@ static int qdio_do_sqbs(struct qdio_q *q, unsigned char state, int start, + return 0; + } + +-/* returns number of examined buffers and their common state in *state */ ++/* ++ * Returns number of examined buffers and their common state in *state. ++ * Requested number of buffers-to-examine must be > 0. 
++ */ + static inline int get_buf_states(struct qdio_q *q, unsigned int bufnr, + unsigned char *state, unsigned int count, + int auto_ack, int merge_pending) +@@ -223,17 +219,23 @@ static inline int get_buf_states(struct qdio_q *q, unsigned int bufnr, + if (is_qebsm(q)) + return qdio_do_eqbs(q, state, bufnr, count, auto_ack); + +- for (i = 0; i < count; i++) { +- if (!__state) { +- __state = q->slsb.val[bufnr]; +- if (merge_pending && __state == SLSB_P_OUTPUT_PENDING) +- __state = SLSB_P_OUTPUT_EMPTY; +- } else if (merge_pending) { +- if ((q->slsb.val[bufnr] & __state) != __state) +- break; +- } else if (q->slsb.val[bufnr] != __state) +- break; ++ /* get initial state: */ ++ __state = q->slsb.val[bufnr]; ++ if (merge_pending && __state == SLSB_P_OUTPUT_PENDING) ++ __state = SLSB_P_OUTPUT_EMPTY; ++ ++ for (i = 1; i < count; i++) { + bufnr = next_buf(bufnr); ++ ++ /* merge PENDING into EMPTY: */ ++ if (merge_pending && ++ q->slsb.val[bufnr] == SLSB_P_OUTPUT_PENDING && ++ __state == SLSB_P_OUTPUT_EMPTY) ++ continue; ++ ++ /* stop if next state differs from initial state: */ ++ if (q->slsb.val[bufnr] != __state) ++ break; + } + *state = __state; + return i; +diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c +index e2c37aeed45a..fce49ebc575d 100644 +--- a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -1175,10 +1175,12 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, + /* Caller should have vq mutex and device mutex */ + int vhost_vq_access_ok(struct vhost_virtqueue *vq) + { +- int ret = vq_log_access_ok(vq, vq->log_base); ++ if (!vq_log_access_ok(vq, vq->log_base)) ++ return 0; + +- if (ret || vq->iotlb) +- return ret; ++ /* Access validation occurs at prefetch time with IOTLB */ ++ if (vq->iotlb) ++ return 1; + + return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); + } +diff --git a/fs/namei.c b/fs/namei.c +index 891670e0956b..85ac38b99065 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -221,9 +221,10 @@ getname_kernel(const char * filename) + if (len <= EMBEDDED_NAME_MAX) { + result->name = (char *)result->iname; + } else if (len <= PATH_MAX) { ++ const size_t size = offsetof(struct filename, iname[1]); + struct filename *tmp; + +- tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); ++ tmp = kmalloc(size, GFP_KERNEL); + if (unlikely(!tmp)) { + __putname(result); + return ERR_PTR(-ENOMEM); +diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h +new file mode 100644 +index 000000000000..e518e4e3dfb5 +--- /dev/null ++++ b/include/kvm/arm_psci.h +@@ -0,0 +1,51 @@ ++/* ++ * Copyright (C) 2012,2013 - ARM Ltd ++ * Author: Marc Zyngier ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see . 
++ */ ++ ++#ifndef __KVM_ARM_PSCI_H__ ++#define __KVM_ARM_PSCI_H__ ++ ++#include ++#include ++ ++#define KVM_ARM_PSCI_0_1 PSCI_VERSION(0, 1) ++#define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2) ++#define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0) ++ ++#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_0 ++ ++/* ++ * We need the KVM pointer independently from the vcpu as we can call ++ * this from HYP, and need to apply kern_hyp_va on it... ++ */ ++static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm) ++{ ++ /* ++ * Our PSCI implementation stays the same across versions from ++ * v0.2 onward, only adding the few mandatory functions (such ++ * as FEATURES with 1.0) that are required by newer ++ * revisions. It is thus safe to return the latest. ++ */ ++ if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features)) ++ return KVM_ARM_PSCI_LATEST; ++ ++ return KVM_ARM_PSCI_0_1; ++} ++ ++ ++int kvm_hvc_call_handler(struct kvm_vcpu *vcpu); ++ ++#endif /* __KVM_ARM_PSCI_H__ */ +diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h +index 4c5bca38c653..a031897fca76 100644 +--- a/include/linux/arm-smccc.h ++++ b/include/linux/arm-smccc.h +@@ -14,14 +14,16 @@ + #ifndef __LINUX_ARM_SMCCC_H + #define __LINUX_ARM_SMCCC_H + ++#include ++ + /* + * This file provides common defines for ARM SMC Calling Convention as + * specified in + * http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html + */ + +-#define ARM_SMCCC_STD_CALL 0 +-#define ARM_SMCCC_FAST_CALL 1 ++#define ARM_SMCCC_STD_CALL _AC(0,U) ++#define ARM_SMCCC_FAST_CALL _AC(1,U) + #define ARM_SMCCC_TYPE_SHIFT 31 + + #define ARM_SMCCC_SMC_32 0 +@@ -60,6 +62,24 @@ + #define ARM_SMCCC_QUIRK_NONE 0 + #define ARM_SMCCC_QUIRK_QCOM_A6 1 /* Save/restore register a6 */ + ++#define ARM_SMCCC_VERSION_1_0 0x10000 ++#define ARM_SMCCC_VERSION_1_1 0x10001 ++ ++#define ARM_SMCCC_VERSION_FUNC_ID \ ++ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ++ ARM_SMCCC_SMC_32, \ ++ 0, 0) ++ ++#define ARM_SMCCC_ARCH_FEATURES_FUNC_ID \ ++ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ++ ARM_SMCCC_SMC_32, \ ++ 0, 1) ++ ++#define ARM_SMCCC_ARCH_WORKAROUND_1 \ ++ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ++ ARM_SMCCC_SMC_32, \ ++ 0, 0x8000) ++ + #ifndef __ASSEMBLY__ + + #include +@@ -130,5 +150,146 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, + + #define arm_smccc_hvc_quirk(...) __arm_smccc_hvc(__VA_ARGS__) + ++/* SMCCC v1.1 implementation madness follows */ ++#ifdef CONFIG_ARM64 ++ ++#define SMCCC_SMC_INST "smc #0" ++#define SMCCC_HVC_INST "hvc #0" ++ ++#elif defined(CONFIG_ARM) ++#include ++#include ++ ++#define SMCCC_SMC_INST __SMC(0) ++#define SMCCC_HVC_INST __HVC(0) ++ ++#endif ++ ++#define ___count_args(_0, _1, _2, _3, _4, _5, _6, _7, _8, x, ...) x ++ ++#define __count_args(...) 
\ ++ ___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0) ++ ++#define __constraint_write_0 \ ++ "+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3) ++#define __constraint_write_1 \ ++ "+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3) ++#define __constraint_write_2 \ ++ "+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3) ++#define __constraint_write_3 \ ++ "+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3) ++#define __constraint_write_4 __constraint_write_3 ++#define __constraint_write_5 __constraint_write_4 ++#define __constraint_write_6 __constraint_write_5 ++#define __constraint_write_7 __constraint_write_6 ++ ++#define __constraint_read_0 ++#define __constraint_read_1 ++#define __constraint_read_2 ++#define __constraint_read_3 ++#define __constraint_read_4 "r" (r4) ++#define __constraint_read_5 __constraint_read_4, "r" (r5) ++#define __constraint_read_6 __constraint_read_5, "r" (r6) ++#define __constraint_read_7 __constraint_read_6, "r" (r7) ++ ++#define __declare_arg_0(a0, res) \ ++ struct arm_smccc_res *___res = res; \ ++ register u32 r0 asm("r0") = a0; \ ++ register unsigned long r1 asm("r1"); \ ++ register unsigned long r2 asm("r2"); \ ++ register unsigned long r3 asm("r3") ++ ++#define __declare_arg_1(a0, a1, res) \ ++ struct arm_smccc_res *___res = res; \ ++ register u32 r0 asm("r0") = a0; \ ++ register typeof(a1) r1 asm("r1") = a1; \ ++ register unsigned long r2 asm("r2"); \ ++ register unsigned long r3 asm("r3") ++ ++#define __declare_arg_2(a0, a1, a2, res) \ ++ struct arm_smccc_res *___res = res; \ ++ register u32 r0 asm("r0") = a0; \ ++ register typeof(a1) r1 asm("r1") = a1; \ ++ register typeof(a2) r2 asm("r2") = a2; \ ++ register unsigned long r3 asm("r3") ++ ++#define __declare_arg_3(a0, a1, a2, a3, res) \ ++ struct arm_smccc_res *___res = res; \ ++ register u32 r0 asm("r0") = a0; \ ++ register typeof(a1) r1 asm("r1") = a1; \ ++ register typeof(a2) r2 asm("r2") = a2; \ ++ register typeof(a3) r3 asm("r3") = a3 ++ ++#define __declare_arg_4(a0, a1, a2, a3, a4, res) \ ++ __declare_arg_3(a0, a1, a2, a3, res); \ ++ register typeof(a4) r4 asm("r4") = a4 ++ ++#define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \ ++ __declare_arg_4(a0, a1, a2, a3, a4, res); \ ++ register typeof(a5) r5 asm("r5") = a5 ++ ++#define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \ ++ __declare_arg_5(a0, a1, a2, a3, a4, a5, res); \ ++ register typeof(a6) r6 asm("r6") = a6 ++ ++#define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \ ++ __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \ ++ register typeof(a7) r7 asm("r7") = a7 ++ ++#define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__) ++#define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__) ++ ++#define ___constraints(count) \ ++ : __constraint_write_ ## count \ ++ : __constraint_read_ ## count \ ++ : "memory" ++#define __constraints(count) ___constraints(count) ++ ++/* ++ * We have an output list that is not necessarily used, and GCC feels ++ * entitled to optimise the whole sequence away. "volatile" is what ++ * makes it stick. ++ */ ++#define __arm_smccc_1_1(inst, ...) \ ++ do { \ ++ __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \ ++ asm volatile(inst "\n" \ ++ __constraints(__count_args(__VA_ARGS__))); \ ++ if (___res) \ ++ *___res = (typeof(*___res)){r0, r1, r2, r3}; \ ++ } while (0) ++ ++/* ++ * arm_smccc_1_1_smc() - make an SMCCC v1.1 compliant SMC call ++ * ++ * This is a variadic macro taking one to eight source arguments, and ++ * an optional return structure. 
++ * ++ * @a0-a7: arguments passed in registers 0 to 7 ++ * @res: result values from registers 0 to 3 ++ * ++ * This macro is used to make SMC calls following SMC Calling Convention v1.1. ++ * The content of the supplied param are copied to registers 0 to 7 prior ++ * to the SMC instruction. The return values are updated with the content ++ * from register 0 to 3 on return from the SMC instruction if not NULL. ++ */ ++#define arm_smccc_1_1_smc(...) __arm_smccc_1_1(SMCCC_SMC_INST, __VA_ARGS__) ++ ++/* ++ * arm_smccc_1_1_hvc() - make an SMCCC v1.1 compliant HVC call ++ * ++ * This is a variadic macro taking one to eight source arguments, and ++ * an optional return structure. ++ * ++ * @a0-a7: arguments passed in registers 0 to 7 ++ * @res: result values from registers 0 to 3 ++ * ++ * This macro is used to make HVC calls following SMC Calling Convention v1.1. ++ * The content of the supplied param are copied to registers 0 to 7 prior ++ * to the HVC instruction. The return values are updated with the content ++ * from register 0 to 3 on return from the HVC instruction if not NULL. ++ */ ++#define arm_smccc_1_1_hvc(...) __arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__) ++ + #endif /*__ASSEMBLY__*/ + #endif /*__LINUX_ARM_SMCCC_H*/ +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 8e506783631b..4a07ff4f38e1 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -76,6 +76,10 @@ extern int mmap_rnd_compat_bits __read_mostly; + #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) + #endif + ++#ifndef lm_alias ++#define lm_alias(x) __va(__pa_symbol(x)) ++#endif ++ + /* + * To prevent common memory management code establishing + * a zero page mapping on a read fault. +diff --git a/include/linux/psci.h b/include/linux/psci.h +index bdea1cb5e1db..347077cf19c6 100644 +--- a/include/linux/psci.h ++++ b/include/linux/psci.h +@@ -25,7 +25,19 @@ bool psci_tos_resident_on(int cpu); + int psci_cpu_init_idle(unsigned int cpu); + int psci_cpu_suspend_enter(unsigned long index); + ++enum psci_conduit { ++ PSCI_CONDUIT_NONE, ++ PSCI_CONDUIT_SMC, ++ PSCI_CONDUIT_HVC, ++}; ++ ++enum smccc_version { ++ SMCCC_VERSION_1_0, ++ SMCCC_VERSION_1_1, ++}; ++ + struct psci_operations { ++ u32 (*get_version)(void); + int (*cpu_suspend)(u32 state, unsigned long entry_point); + int (*cpu_off)(u32 state); + int (*cpu_on)(unsigned long cpuid, unsigned long entry_point); +@@ -33,6 +45,8 @@ struct psci_operations { + int (*affinity_info)(unsigned long target_affinity, + unsigned long lowest_affinity_level); + int (*migrate_info_type)(void); ++ enum psci_conduit conduit; ++ enum smccc_version smccc_version; + }; + + extern struct psci_operations psci_ops; +diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h +index 554671c81f4a..4931787193c3 100644 +--- a/include/net/bluetooth/hci_core.h ++++ b/include/net/bluetooth/hci_core.h +@@ -893,7 +893,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, + u16 conn_timeout); + struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, + u8 dst_type, u8 sec_level, u16 conn_timeout, +- u8 role); ++ u8 role, bdaddr_t *direct_rpa); + struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, + u8 sec_level, u8 auth_type); + struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, +diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h +index 8716d5942b65..8fcf8908a694 100644 +--- a/include/net/slhc_vj.h ++++ b/include/net/slhc_vj.h +@@ -127,6 +127,7 @@ typedef __u32 int32; + */ 
+ struct cstate { + byte_t cs_this; /* connection id number (xmit) */ ++ bool initialized; /* true if initialized */ + struct cstate *next; /* next in ring (xmit) */ + struct iphdr cs_ip; /* ip/tcp hdr from most recent packet */ + struct tcphdr cs_tcp; +diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h +index 3d7a0fc021a7..39930ca998cd 100644 +--- a/include/uapi/linux/psci.h ++++ b/include/uapi/linux/psci.h +@@ -87,6 +87,9 @@ + (((ver) & PSCI_VERSION_MAJOR_MASK) >> PSCI_VERSION_MAJOR_SHIFT) + #define PSCI_VERSION_MINOR(ver) \ + ((ver) & PSCI_VERSION_MINOR_MASK) ++#define PSCI_VERSION(maj, min) \ ++ ((((maj) << PSCI_VERSION_MAJOR_SHIFT) & PSCI_VERSION_MAJOR_MASK) | \ ++ ((min) & PSCI_VERSION_MINOR_MASK)) + + /* PSCI features decoding (>=1.0) */ + #define PSCI_1_0_FEATURES_CPU_SUSPEND_PF_SHIFT 1 +diff --git a/kernel/events/core.c b/kernel/events/core.c +index c4100c38a467..74710fad35d5 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -4091,6 +4091,9 @@ static void _free_event(struct perf_event *event) + if (event->ctx) + put_ctx(event->ctx); + ++ if (event->hw.target) ++ put_task_struct(event->hw.target); ++ + exclusive_event_destroy(event); + module_put(event->pmu->module); + +@@ -9214,6 +9217,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. + */ ++ get_task_struct(task); + event->hw.target = task; + } + +@@ -9331,6 +9335,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, + perf_detach_cgroup(event); + if (event->ns) + put_pid_ns(event->ns); ++ if (event->hw.target) ++ put_task_struct(event->hw.target); + kfree(event); + + return ERR_PTR(err); +diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c +index dc59eae54717..cc061495f653 100644 +--- a/net/bluetooth/hci_conn.c ++++ b/net/bluetooth/hci_conn.c +@@ -749,18 +749,31 @@ static bool conn_use_rpa(struct hci_conn *conn) + } + + static void hci_req_add_le_create_conn(struct hci_request *req, +- struct hci_conn *conn) ++ struct hci_conn *conn, ++ bdaddr_t *direct_rpa) + { + struct hci_cp_le_create_conn cp; + struct hci_dev *hdev = conn->hdev; + u8 own_addr_type; + +- /* Update random address, but set require_privacy to false so +- * that we never connect with an non-resolvable address. ++ /* If direct address was provided we use it instead of current ++ * address. + */ +- if (hci_update_random_address(req, false, conn_use_rpa(conn), +- &own_addr_type)) +- return; ++ if (direct_rpa) { ++ if (bacmp(&req->hdev->random_addr, direct_rpa)) ++ hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, ++ direct_rpa); ++ ++ /* direct address is always RPA */ ++ own_addr_type = ADDR_LE_DEV_RANDOM; ++ } else { ++ /* Update random address, but set require_privacy to false so ++ * that we never connect with an non-resolvable address. 
++ */ ++ if (hci_update_random_address(req, false, conn_use_rpa(conn), ++ &own_addr_type)) ++ return; ++ } + + memset(&cp, 0, sizeof(cp)); + +@@ -825,7 +838,7 @@ static void hci_req_directed_advertising(struct hci_request *req, + + struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, + u8 dst_type, u8 sec_level, u16 conn_timeout, +- u8 role) ++ u8 role, bdaddr_t *direct_rpa) + { + struct hci_conn_params *params; + struct hci_conn *conn; +@@ -940,7 +953,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, + hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); + } + +- hci_req_add_le_create_conn(&req, conn); ++ hci_req_add_le_create_conn(&req, conn, direct_rpa); + + create_conn: + err = hci_req_run(&req, create_le_conn_complete); +diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c +index e17aacbc5630..d2f9eb169ba8 100644 +--- a/net/bluetooth/hci_event.c ++++ b/net/bluetooth/hci_event.c +@@ -4646,7 +4646,8 @@ static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, + /* This function requires the caller holds hdev->lock */ + static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, + bdaddr_t *addr, +- u8 addr_type, u8 adv_type) ++ u8 addr_type, u8 adv_type, ++ bdaddr_t *direct_rpa) + { + struct hci_conn *conn; + struct hci_conn_params *params; +@@ -4697,7 +4698,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, + } + + conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, +- HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER); ++ HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER, ++ direct_rpa); + if (!IS_ERR(conn)) { + /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned + * by higher layer that tried to connect, if no then +@@ -4807,8 +4809,13 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, + bdaddr_type = irk->addr_type; + } + +- /* Check if we have been requested to connect to this device */ +- conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type); ++ /* Check if we have been requested to connect to this device. ++ * ++ * direct_addr is set only for directed advertising reports (it is NULL ++ * for advertising reports) and is already verified to be RPA above. ++ */ ++ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type, ++ direct_addr); + if (conn && type == LE_ADV_IND) { + /* Store report for later inclusion by + * mgmt_device_connected +diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c +index 2bbca23a9d05..1fc23cb4a3e0 100644 +--- a/net/bluetooth/l2cap_core.c ++++ b/net/bluetooth/l2cap_core.c +@@ -7148,7 +7148,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, + hcon = hci_connect_le(hdev, dst, dst_type, + chan->sec_level, + HCI_LE_CONN_TIMEOUT, +- HCI_ROLE_SLAVE); ++ HCI_ROLE_SLAVE, NULL); + else + hcon = hci_connect_le_scan(hdev, dst, dst_type, + chan->sec_level, +diff --git a/net/rds/send.c b/net/rds/send.c +index ef53d164e146..50241d30e16d 100644 +--- a/net/rds/send.c ++++ b/net/rds/send.c +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2006 Oracle. All rights reserved. ++ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU +@@ -983,10 +983,15 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) + if (conn->c_npaths == 0 && hash != 0) { + rds_send_ping(conn); + +- if (conn->c_npaths == 0) { +- wait_event_interruptible(conn->c_hs_waitq, +- (conn->c_npaths != 0)); +- } ++ /* The underlying connection is not up yet. Need to wait ++ * until it is up to be sure that the non-zero c_path can be ++ * used. But if we are interrupted, we have to use the zero ++ * c_path in case the connection ends up being non-MP capable. ++ */ ++ if (conn->c_npaths == 0) ++ if (wait_event_interruptible(conn->c_hs_waitq, ++ conn->c_npaths != 0)) ++ hash = 0; + if (conn->c_npaths == 1) + hash = 0; + } +diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c +index 79aec90259cd..4afd4149a632 100644 +--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c ++++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c +@@ -237,9 +237,6 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, + + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + +- err = crypto_ahash_init(req); +- if (err) +- goto out; + err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); + if (err) + goto out; +diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c +index 150334064071..ff5bc6363a79 100644 +--- a/tools/perf/tests/code-reading.c ++++ b/tools/perf/tests/code-reading.c +@@ -224,8 +224,6 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, + unsigned char buf2[BUFSZ]; + size_t ret_len; + u64 objdump_addr; +- const char *objdump_name; +- char decomp_name[KMOD_DECOMP_LEN]; + int ret; + + pr_debug("Reading object code for memory address: %#"PRIx64"\n", addr); +@@ -286,25 +284,9 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, + state->done[state->done_cnt++] = al.map->start; + } + +- objdump_name = al.map->dso->long_name; +- if (dso__needs_decompress(al.map->dso)) { +- if (dso__decompress_kmodule_path(al.map->dso, objdump_name, +- decomp_name, +- sizeof(decomp_name)) < 0) { +- pr_debug("decompression failed\n"); +- return -1; +- } +- +- objdump_name = decomp_name; +- } +- + /* Read the object code using objdump */ + objdump_addr = map__rip_2objdump(al.map, al.addr); +- ret = read_via_objdump(objdump_name, objdump_addr, buf2, len); +- +- if (dso__needs_decompress(al.map->dso)) +- unlink(objdump_name); +- ++ ret = read_via_objdump(al.map->dso->long_name, objdump_addr, buf2, len); + if (ret > 0) { + /* + * The kernel maps are inaccurate - assume objdump is right in +diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +index 7e27207d0f45..cac39532c057 100644 +--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c ++++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +@@ -1300,6 +1300,7 @@ static int intel_pt_overflow(struct intel_pt_decoder *decoder) + intel_pt_clear_tx_flags(decoder); + decoder->have_tma = false; + decoder->cbr = 0; ++ decoder->timestamp_insn_cnt = 0; + decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; + decoder->overflow = true; + return -EOVERFLOW; +@@ -1522,6 +1523,7 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) + case INTEL_PT_PSBEND: + intel_pt_log("ERROR: Missing TIP after FUP\n"); + decoder->pkt_state = INTEL_PT_STATE_ERR3; ++ decoder->pkt_step = 0; + return -ENOENT; + + case INTEL_PT_OVF: +@@ -2182,14 +2184,6 @@ const struct intel_pt_state 
*intel_pt_decode(struct intel_pt_decoder *decoder) + return &decoder->state; + } + +-static bool intel_pt_at_psb(unsigned char *buf, size_t len) +-{ +- if (len < INTEL_PT_PSB_LEN) +- return false; +- return memmem(buf, INTEL_PT_PSB_LEN, INTEL_PT_PSB_STR, +- INTEL_PT_PSB_LEN); +-} +- + /** + * intel_pt_next_psb - move buffer pointer to the start of the next PSB packet. + * @buf: pointer to buffer pointer +@@ -2278,6 +2272,7 @@ static unsigned char *intel_pt_last_psb(unsigned char *buf, size_t len) + * @buf: buffer + * @len: size of buffer + * @tsc: TSC value returned ++ * @rem: returns remaining size when TSC is found + * + * Find a TSC packet in @buf and return the TSC value. This function assumes + * that @buf starts at a PSB and that PSB+ will contain TSC and so stops if a +@@ -2285,7 +2280,8 @@ static unsigned char *intel_pt_last_psb(unsigned char *buf, size_t len) + * + * Return: %true if TSC is found, false otherwise. + */ +-static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc) ++static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc, ++ size_t *rem) + { + struct intel_pt_pkt packet; + int ret; +@@ -2296,6 +2292,7 @@ static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc) + return false; + if (packet.type == INTEL_PT_TSC) { + *tsc = packet.payload; ++ *rem = len; + return true; + } + if (packet.type == INTEL_PT_PSBEND) +@@ -2346,6 +2343,8 @@ static int intel_pt_tsc_cmp(uint64_t tsc1, uint64_t tsc2) + * @len_a: size of first buffer + * @buf_b: second buffer + * @len_b: size of second buffer ++ * @consecutive: returns true if there is data in buf_b that is consecutive ++ * to buf_a + * + * If the trace contains TSC we can look at the last TSC of @buf_a and the + * first TSC of @buf_b in order to determine if the buffers overlap, and then +@@ -2358,33 +2357,41 @@ static int intel_pt_tsc_cmp(uint64_t tsc1, uint64_t tsc2) + static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, + size_t len_a, + unsigned char *buf_b, +- size_t len_b) ++ size_t len_b, bool *consecutive) + { + uint64_t tsc_a, tsc_b; + unsigned char *p; +- size_t len; ++ size_t len, rem_a, rem_b; + + p = intel_pt_last_psb(buf_a, len_a); + if (!p) + return buf_b; /* No PSB in buf_a => no overlap */ + + len = len_a - (p - buf_a); +- if (!intel_pt_next_tsc(p, len, &tsc_a)) { ++ if (!intel_pt_next_tsc(p, len, &tsc_a, &rem_a)) { + /* The last PSB+ in buf_a is incomplete, so go back one more */ + len_a -= len; + p = intel_pt_last_psb(buf_a, len_a); + if (!p) + return buf_b; /* No full PSB+ => assume no overlap */ + len = len_a - (p - buf_a); +- if (!intel_pt_next_tsc(p, len, &tsc_a)) ++ if (!intel_pt_next_tsc(p, len, &tsc_a, &rem_a)) + return buf_b; /* No TSC in buf_a => assume no overlap */ + } + + while (1) { + /* Ignore PSB+ with no TSC */ +- if (intel_pt_next_tsc(buf_b, len_b, &tsc_b) && +- intel_pt_tsc_cmp(tsc_a, tsc_b) < 0) +- return buf_b; /* tsc_a < tsc_b => no overlap */ ++ if (intel_pt_next_tsc(buf_b, len_b, &tsc_b, &rem_b)) { ++ int cmp = intel_pt_tsc_cmp(tsc_a, tsc_b); ++ ++ /* Same TSC, so buffers are consecutive */ ++ if (!cmp && rem_b >= rem_a) { ++ *consecutive = true; ++ return buf_b + len_b - (rem_b - rem_a); ++ } ++ if (cmp < 0) ++ return buf_b; /* tsc_a < tsc_b => no overlap */ ++ } + + if (!intel_pt_step_psb(&buf_b, &len_b)) + return buf_b + len_b; /* No PSB in buf_b => no data */ +@@ -2398,6 +2405,8 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, + * @buf_b: second buffer + * @len_b: size of second buffer + 
* @have_tsc: can use TSC packets to detect overlap ++ * @consecutive: returns true if there is data in buf_b that is consecutive ++ * to buf_a + * + * When trace samples or snapshots are recorded there is the possibility that + * the data overlaps. Note that, for the purposes of decoding, data is only +@@ -2408,7 +2417,7 @@ static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, + */ + unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, + unsigned char *buf_b, size_t len_b, +- bool have_tsc) ++ bool have_tsc, bool *consecutive) + { + unsigned char *found; + +@@ -2420,7 +2429,8 @@ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, + return buf_b; /* No overlap */ + + if (have_tsc) { +- found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b); ++ found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b, ++ consecutive); + if (found) + return found; + } +@@ -2435,28 +2445,16 @@ unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, + } + + /* Now len_b >= len_a */ +- if (len_b > len_a) { +- /* The leftover buffer 'b' must start at a PSB */ +- while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) { +- if (!intel_pt_step_psb(&buf_a, &len_a)) +- return buf_b; /* No overlap */ +- } +- } +- + while (1) { + /* Potential overlap so check the bytes */ + found = memmem(buf_a, len_a, buf_b, len_a); +- if (found) ++ if (found) { ++ *consecutive = true; + return buf_b + len_a; ++ } + + /* Try again at next PSB in buffer 'a' */ + if (!intel_pt_step_psb(&buf_a, &len_a)) + return buf_b; /* No overlap */ +- +- /* The leftover buffer 'b' must start at a PSB */ +- while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) { +- if (!intel_pt_step_psb(&buf_a, &len_a)) +- return buf_b; /* No overlap */ +- } + } + } +diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +index 89399985fa4d..9ae4df1dcedc 100644 +--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h ++++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +@@ -103,7 +103,7 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder); + + unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, + unsigned char *buf_b, size_t len_b, +- bool have_tsc); ++ bool have_tsc, bool *consecutive); + + int intel_pt__strerror(int code, char *buf, size_t buflen); + +diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c +index dc041d4368c8..b1161d725ce9 100644 +--- a/tools/perf/util/intel-pt.c ++++ b/tools/perf/util/intel-pt.c +@@ -131,6 +131,7 @@ struct intel_pt_queue { + bool stop; + bool step_through_buffers; + bool use_buffer_pid_tid; ++ bool sync_switch; + pid_t pid, tid; + int cpu; + int switch_state; +@@ -194,14 +195,17 @@ static void intel_pt_dump_event(struct intel_pt *pt, unsigned char *buf, + static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer *a, + struct auxtrace_buffer *b) + { ++ bool consecutive = false; + void *start; + + start = intel_pt_find_overlap(a->data, a->size, b->data, b->size, +- pt->have_tsc); ++ pt->have_tsc, &consecutive); + if (!start) + return -EINVAL; + b->use_size = b->data + b->size - start; + b->use_data = start; ++ if (b->use_size && consecutive) ++ b->consecutive = true; + return 0; + } + +@@ -928,10 +932,12 @@ static int intel_pt_setup_queue(struct intel_pt *pt, + if (pt->timeless_decoding || !pt->have_sched_switch) + ptq->use_buffer_pid_tid = true; + } ++ ++ ptq->sync_switch = pt->sync_switch; + } + 
+ if (!ptq->on_heap && +- (!pt->sync_switch || ++ (!ptq->sync_switch || + ptq->switch_state != INTEL_PT_SS_EXPECTING_SWITCH_EVENT)) { + const struct intel_pt_state *state; + int ret; +@@ -1333,7 +1339,7 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) + if (pt->synth_opts.last_branch) + intel_pt_update_last_branch_rb(ptq); + +- if (!pt->sync_switch) ++ if (!ptq->sync_switch) + return 0; + + if (intel_pt_is_switch_ip(ptq, state->to_ip)) { +@@ -1414,6 +1420,21 @@ static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip) + return switch_ip; + } + ++static void intel_pt_enable_sync_switch(struct intel_pt *pt) ++{ ++ unsigned int i; ++ ++ pt->sync_switch = true; ++ ++ for (i = 0; i < pt->queues.nr_queues; i++) { ++ struct auxtrace_queue *queue = &pt->queues.queue_array[i]; ++ struct intel_pt_queue *ptq = queue->priv; ++ ++ if (ptq) ++ ptq->sync_switch = true; ++ } ++} ++ + static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) + { + const struct intel_pt_state *state = ptq->state; +@@ -1430,7 +1451,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) + if (pt->switch_ip) { + intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n", + pt->switch_ip, pt->ptss_ip); +- pt->sync_switch = true; ++ intel_pt_enable_sync_switch(pt); + } + } + } +@@ -1446,9 +1467,9 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) + if (state->err) { + if (state->err == INTEL_PT_ERR_NODATA) + return 1; +- if (pt->sync_switch && ++ if (ptq->sync_switch && + state->from_ip >= pt->kernel_start) { +- pt->sync_switch = false; ++ ptq->sync_switch = false; + intel_pt_next_tid(pt, ptq); + } + if (pt->synth_opts.errors) { +@@ -1474,7 +1495,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) + state->timestamp, state->est_timestamp); + ptq->timestamp = state->est_timestamp; + /* Use estimated TSC in unknown switch state */ +- } else if (pt->sync_switch && ++ } else if (ptq->sync_switch && + ptq->switch_state == INTEL_PT_SS_UNKNOWN && + intel_pt_is_switch_ip(ptq, state->to_ip) && + ptq->next_tid == -1) { +@@ -1621,7 +1642,7 @@ static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid, + return 1; + + ptq = intel_pt_cpu_to_ptq(pt, cpu); +- if (!ptq) ++ if (!ptq || !ptq->sync_switch) + return 1; + + switch (ptq->switch_state) {