From f4868c99213c2b67d84e4506571216d23aa2d9fb Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 7 Nov 2014 22:48:22 +0100 Subject: [PATCH] Remove inconsistency in ARM support. This facilitates "universal" builds, ones that target multiple architectures, e.g. ARMv5 through ARMv7. See commentary in Configure for details. Reviewed-by: Ard Biesheuvel Reviewed-by: Matt Caswell (cherry picked from commit c1669e1c205dc8e695fb0c10a655f434e758b9f7) --- Configure | 30 ++++++- crypto/aes/asm/aesv8-armx.pl | 8 +- crypto/aes/asm/bsaes-armv7.pl | 8 +- crypto/arm_arch.h | 12 +++ crypto/armcap.c | 7 +- crypto/armv4cpuid.S | 140 ++++++++++++++--------------- crypto/bn/asm/armv4-gf2m.pl | 130 ++++++++++++++------------- crypto/bn/asm/armv4-mont.pl | 9 +- crypto/evp/e_aes.c | 2 +- crypto/modes/asm/ghash-armv4.pl | 3 +- crypto/modes/gcm128.c | 2 +- crypto/sha/asm/sha1-armv4-large.pl | 11 ++- crypto/sha/asm/sha256-armv4.pl | 11 ++- crypto/sha/asm/sha512-armv4.pl | 11 ++- 14 files changed, 224 insertions(+), 160 deletions(-) diff --git a/Configure b/Configure index 4d943e1a1..0850d900d 100755 --- a/Configure +++ b/Configure @@ -351,8 +351,34 @@ my %table=( # throw in -D[BL]_ENDIAN, whichever appropriate... "linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc32_asm}:linux32:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -# It's believed that majority of ARM toolchains predefine appropriate -march. -# If you compiler does not, do complement config command line with one! + +####################################################################### +# Note that -march is not among compiler options in below linux-armv4 +# target line. 
Not specifying one is intentional to give you choice to: +# +# a) rely on your compiler default by not specifying one; +# b) specify your target platform explicitly for optimal performance, +# e.g. -march=armv6 or -march=armv7-a; +# c) build "universal" binary that targets *range* of platforms by +# specifying minimum and maximum supported architecture; +# +# As for c) option. It actually makes no sense to specify maximum to be +# less than ARMv7, because it's the least requirement for run-time +# switch between platform-specific code paths. And without run-time +# switch performance would be equivalent to one for minimum. Secondly, +# there are some natural limitations that you'd have to accept and +# respect. Most notably you can *not* build "universal" binary for +# big-endian platform. This is because ARMv7 processor always picks +# instructions in little-endian order. Another similar limitation is +# that -mthumb can't "cross" -march=armv6t2 boundary, because that's +# where it became Thumb-2. Well, this limitation is a bit artificial, +# because it's not really impossible, but it's deemed too tricky to +# support. And of course you have to be sure that your binutils are +# actually up to the task of handling maximum target platform. 
With all +# this in mind here is an example of how to configure "universal" build: +# +# ./Configure linux-armv4 -march=armv6 -D__ARM_MAX_ARCH__=8 +# "linux-armv4", "gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-aarch64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # Configure script adds minimally required -march for assembly support, diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl index 923c7f62d..1e93f8685 100755 --- a/crypto/aes/asm/aesv8-armx.pl +++ b/crypto/aes/asm/aesv8-armx.pl @@ -35,11 +35,13 @@ $prefix="aes_v8"; $code=<<___; #include "arm_arch.h" -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 .text ___ -$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); -$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/); + #^^^^^^ this is done to simplify adoption by not depending + # on latest binutils. # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, # NEON is mostly 32-bit mnemonics, integer - mostly 64. 
Goal is to diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl index f3d96d932..fcc81d1a4 100644 --- a/crypto/aes/asm/bsaes-armv7.pl +++ b/crypto/aes/asm/bsaes-armv7.pl @@ -702,13 +702,17 @@ $code.=<<___; # define BSAES_ASM_EXTENDED_KEY # define XTS_CHAIN_TWEAK # define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ #endif #ifdef __thumb__ # define adrl adr #endif -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + .text .syntax unified @ ARMv7-capable assembler is expected to handle this #ifdef __thumb2__ @@ -717,8 +721,6 @@ $code.=<<___; .code 32 #endif -.fpu neon - .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 6fa87244d..d137c926f 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -52,6 +52,18 @@ #include #endif +#if !defined(__ARM_MAX_ARCH__) +# define __ARM_MAX_ARCH__ __ARM_ARCH__ +#endif + +#if __ARM_MAX_ARCH__<__ARM_ARCH__ +# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" +#elif __ARM_MAX_ARCH__!=__ARM_ARCH__ +# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__) +# error "can't build universal big-endian binary" +# endif +#endif + #if !__ASSEMBLER__ extern unsigned int OPENSSL_armcap_P; #endif diff --git a/crypto/armcap.c b/crypto/armcap.c index 7e46d07a3..24f7a0829 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -7,8 +7,12 @@ #include "arm_arch.h" -unsigned int OPENSSL_armcap_P; +unsigned int OPENSSL_armcap_P=0; +#if __ARM_MAX_ARCH__<7 +void OPENSSL_cpuid_setup(void) {} +unsigned long OPENSSL_rdtsc(void) { return 0; } +#else static sigset_t all_masked; static sigjmp_buf ill_jmp; @@ -155,3 +159,4 @@ void OPENSSL_cpuid_setup(void) sigaction (SIGILL,&ill_oact,NULL); sigprocmask(SIG_SETMASK,&oset,NULL); } +#endif diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S index 005931130..65010ae4f 100644 --- a/crypto/armv4cpuid.S +++ b/crypto/armv4cpuid.S @@ -3,69 +3,6 @@ .text .code 32 
-@ Special note about using .byte directives to encode instructions. -@ Initial reason for hand-coding instructions was to allow module to -@ be compilable by legacy tool-chains. At later point it was pointed -@ out that since ARMv7, instructions are always encoded in little-endian -@ order, therefore one has to opt for endian-neutral presentation. -@ Contemporary tool-chains offer .inst directive for this purpose, -@ but not legacy ones. Therefore .byte. But there is an exception, -@ namely ARMv7-R profile still allows for big-endian encoding even for -@ instructions. This raises the question what if probe instructions -@ appear executable to such processor operating in big-endian order? -@ They have to be chosen in a way that avoids this problem. As failed -@ NEON probe disables a number of other probes we have to ensure that -@ only NEON probe instruction doesn't appear executable in big-endian -@ order, therefore 'vorr q8,q8,q8', and not some other register. The -@ only probe that is not bypassed on failed NEON probe is _armv7_tick, -@ where you'll spot 'mov r0,r6' that serves this purpose. Basic idea is -@ that if fetched in alternative byte oder instruction should crash to -@ denote lack of probed capability... 
- -.align 5 -.global _armv7_neon_probe -.type _armv7_neon_probe,%function -_armv7_neon_probe: - .byte 0xf0,0x01,0x60,0xf2 @ vorr q8,q8,q8 - .byte 0x1e,0xff,0x2f,0xe1 @ bx lr -.size _armv7_neon_probe,.-_armv7_neon_probe - -.global _armv7_tick -.type _armv7_tick,%function -_armv7_tick: - .byte 0x06,0x00,0xa0,0xe1 @ mov r0,r6 - .byte 0x1e,0x0f,0x51,0xec @ mrrc p15,1,r0,r1,c14 @ CNTVCT - .byte 0x1e,0xff,0x2f,0xe1 @ bx lr - nop -.size _armv7_tick,.-_armv7_tick - -.global _armv8_aes_probe -.type _armv8_aes_probe,%function -_armv8_aes_probe: - .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0 - .byte 0x1e,0xff,0x2f,0xe1 @ bx lr -.size _armv8_aes_probe,.-_armv8_aes_probe - -.global _armv8_sha1_probe -.type _armv8_sha1_probe,%function -_armv8_sha1_probe: - .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0 - .byte 0x1e,0xff,0x2f,0xe1 @ bx lr -.size _armv8_sha1_probe,.-_armv8_sha1_probe - -.global _armv8_sha256_probe -.type _armv8_sha256_probe,%function -_armv8_sha256_probe: - .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0 - .byte 0x1e,0xff,0x2f,0xe1 @ bx lr -.size _armv8_sha256_probe,.-_armv8_sha256_probe -.global _armv8_pmull_probe -.type _armv8_pmull_probe,%function -_armv8_pmull_probe: - .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0 - .byte 0x1e,0xff,0x2f,0xe1 @ bx lr -.size _armv8_pmull_probe,.-_armv8_pmull_probe - .align 5 .global OPENSSL_atomic_add .type OPENSSL_atomic_add,%function @@ -139,30 +76,81 @@ OPENSSL_cleanse: #endif .size OPENSSL_cleanse,.-OPENSSL_cleanse +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.align 5 +.global _armv7_neon_probe +.type _armv7_neon_probe,%function +_armv7_neon_probe: + vorr q0,q0,q0 + bx lr +.size _armv7_neon_probe,.-_armv7_neon_probe + +.global _armv7_tick +.type _armv7_tick,%function +_armv7_tick: + mrrc p15,1,r0,r1,c14 @ CNTVCT + bx lr +.size _armv7_tick,.-_armv7_tick + +.global _armv8_aes_probe +.type _armv8_aes_probe,%function +_armv8_aes_probe: + .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0 + bx lr +.size 
_armv8_aes_probe,.-_armv8_aes_probe + +.global _armv8_sha1_probe +.type _armv8_sha1_probe,%function +_armv8_sha1_probe: + .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0 + bx lr +.size _armv8_sha1_probe,.-_armv8_sha1_probe + +.global _armv8_sha256_probe +.type _armv8_sha256_probe,%function +_armv8_sha256_probe: + .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0 + bx lr +.size _armv8_sha256_probe,.-_armv8_sha256_probe +.global _armv8_pmull_probe +.type _armv8_pmull_probe,%function +_armv8_pmull_probe: + .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0 + bx lr +.size _armv8_pmull_probe,.-_armv8_pmull_probe +#endif + .global OPENSSL_wipe_cpu .type OPENSSL_wipe_cpu,%function OPENSSL_wipe_cpu: +#if __ARM_MAX_ARCH__>=7 ldr r0,.LOPENSSL_armcap adr r1,.LOPENSSL_armcap ldr r0,[r1,r0] +#endif eor r2,r2,r2 eor r3,r3,r3 eor ip,ip,ip +#if __ARM_MAX_ARCH__>=7 tst r0,#1 beq .Lwipe_done - .byte 0x50,0x01,0x00,0xf3 @ veor q0, q0, q0 - .byte 0x52,0x21,0x02,0xf3 @ veor q1, q1, q1 - .byte 0x54,0x41,0x04,0xf3 @ veor q2, q2, q2 - .byte 0x56,0x61,0x06,0xf3 @ veor q3, q3, q3 - .byte 0xf0,0x01,0x40,0xf3 @ veor q8, q8, q8 - .byte 0xf2,0x21,0x42,0xf3 @ veor q9, q9, q9 - .byte 0xf4,0x41,0x44,0xf3 @ veor q10, q10, q10 - .byte 0xf6,0x61,0x46,0xf3 @ veor q11, q11, q11 - .byte 0xf8,0x81,0x48,0xf3 @ veor q12, q12, q12 - .byte 0xfa,0xa1,0x4a,0xf3 @ veor q13, q13, q13 - .byte 0xfc,0xc1,0x4c,0xf3 @ veor q14, q14, q14 - .byte 0xfe,0xe1,0x4e,0xf3 @ veor q14, q14, q14 + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + veor q3, q3, q3 + veor q8, q8, q8 + veor q9, q9, q9 + veor q10, q10, q10 + veor q11, q11, q11 + veor q12, q12, q12 + veor q13, q13, q13 + veor q14, q14, q14 + veor q15, q15, q15 .Lwipe_done: +#endif mov r0,sp #if __ARM_ARCH__>=5 bx lr @@ -200,8 +188,10 @@ OPENSSL_instrument_bus2: .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 .align 5 +#if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: .word OPENSSL_armcap_P-.LOPENSSL_armcap +#endif #if __ARM_ARCH__>=6 .align 5 #else diff --git 
a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index b781afbf8..8f529c95c 100644 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -40,10 +40,6 @@ $code=<<___; .text .code 32 - -#if __ARM_ARCH__>=7 -.fpu neon -#endif ___ ################ # private interface to mul_1x1_ialu @@ -142,72 +138,18 @@ ___ # BN_ULONG a1,BN_ULONG a0, # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 { -my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12)); -my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31)); - $code.=<<___; .global bn_GF2m_mul_2x2 .type bn_GF2m_mul_2x2,%function .align 5 bn_GF2m_mul_2x2: -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 ldr r12,.LOPENSSL_armcap .Lpic: ldr r12,[pc,r12] tst r12,#1 - beq .Lialu - - ldr r12, [sp] @ 5th argument - vmov.32 $a, r2, r1 - vmov.32 $b, r12, r3 - vmov.i64 $k48, #0x0000ffffffffffff - vmov.i64 $k32, #0x00000000ffffffff - vmov.i64 $k16, #0x000000000000ffff - - vext.8 $t0#lo, $a, $a, #1 @ A1 - vmull.p8 $t0, $t0#lo, $b @ F = A1*B - vext.8 $r#lo, $b, $b, #1 @ B1 - vmull.p8 $r, $a, $r#lo @ E = A*B1 - vext.8 $t1#lo, $a, $a, #2 @ A2 - vmull.p8 $t1, $t1#lo, $b @ H = A2*B - vext.8 $t3#lo, $b, $b, #2 @ B2 - vmull.p8 $t3, $a, $t3#lo @ G = A*B2 - vext.8 $t2#lo, $a, $a, #3 @ A3 - veor $t0, $t0, $r @ L = E + F - vmull.p8 $t2, $t2#lo, $b @ J = A3*B - vext.8 $r#lo, $b, $b, #3 @ B3 - veor $t1, $t1, $t3 @ M = G + H - vmull.p8 $r, $a, $r#lo @ I = A*B3 - veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 - vand $t0#hi, $t0#hi, $k48 - vext.8 $t3#lo, $b, $b, #4 @ B4 - veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 - vand $t1#hi, $t1#hi, $k32 - vmull.p8 $t3, $a, $t3#lo @ K = A*B4 - veor $t2, $t2, $r @ N = I + J - veor $t0#lo, $t0#lo, $t0#hi - veor $t1#lo, $t1#lo, $t1#hi - veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 - vand $t2#hi, $t2#hi, $k16 - vext.8 $t0, $t0, $t0, #15 - veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 - vmov.i64 $t3#hi, #0 - vext.8 $t1, $t1, $t1, #14 - veor $t2#lo, $t2#lo, $t2#hi - vmull.p8 $r, 
$a, $b @ D = A*B - vext.8 $t3, $t3, $t3, #12 - vext.8 $t2, $t2, $t2, #13 - veor $t0, $t0, $t1 - veor $t2, $t2, $t3 - veor $r, $r, $t0 - veor $r, $r, $t2 - - vst1.32 {$r}, [r0] - ret @ bx lr -.align 4 -.Lialu: + bne .LNEON #endif ___ -} $ret="r10"; # reassigned 1st argument $code.=<<___; stmdb sp!,{r4-r10,lr} @@ -257,8 +199,72 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif +___ +} +{ +my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12)); +my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.align 5 +.LNEON: + ldr r12, [sp] @ 5th argument + vmov.32 $a, r2, r1 + vmov.32 $b, r12, r3 + vmov.i64 $k48, #0x0000ffffffffffff + vmov.i64 $k32, #0x00000000ffffffff + vmov.i64 $k16, #0x000000000000ffff + + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, 
$t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 + + vst1.32 {$r}, [r0] + ret @ bx lr +#endif +___ +} +$code.=<<___; .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: .word OPENSSL_armcap_P-(.Lpic+8) @@ -266,7 +272,9 @@ $code.=<<___; .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by " .align 5 +#if __ARM_MAX_ARCH__>=7 .comm OPENSSL_armcap_P,4,4 +#endif ___ foreach (split("\n",$code)) { diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index 72bad8e30..1d330e9f8 100644 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -72,7 +72,7 @@ $code=<<___; .text .code 32 -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: .word OPENSSL_armcap_P-bn_mul_mont @@ -85,7 +85,7 @@ $code=<<___; bn_mul_mont: ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu adr r0,bn_mul_mont @@ -256,7 +256,8 @@ my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a .fpu neon .type bn_mul8x_mont_neon,%function @@ -663,7 +664,7 @@ ___ $code.=<<___; .asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " .align 2 -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 .comm OPENSSL_armcap_P,4,4 #endif ___ diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c index 2a18a562e..36091e4b4 100644 --- a/crypto/evp/e_aes.c +++ b/crypto/evp/e_aes.c @@ -911,7 +911,7 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ #if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) #include "arm_arch.h" -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 # if defined(BSAES_ASM) # define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) # endif diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl 
index 0023bf994..77fbf3446 100644 --- a/crypto/modes/asm/ghash-armv4.pl +++ b/crypto/modes/asm/ghash-armv4.pl @@ -365,7 +365,8 @@ ___ } $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a .fpu neon .global gcm_init_neon diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c index 878c9930d..24ad40a36 100644 --- a/crypto/modes/gcm128.c +++ b/crypto/modes/gcm128.c @@ -675,7 +675,7 @@ void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len # endif # elif defined(__arm__) || defined(__arm) || defined(__aarch64__) # include "arm_arch.h" -# if __ARM_ARCH__>=7 +# if __ARM_MAX_ARCH__>=7 # define GHASH_ASM_ARM # define GCM_FUNCREF_4BIT # define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL) diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl index 50bd07b33..b2c30322c 100644 --- a/crypto/sha/asm/sha1-armv4-large.pl +++ b/crypto/sha/asm/sha1-armv4-large.pl @@ -174,7 +174,7 @@ $code=<<___; .align 5 sha1_block_data_order: -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 sub r3,pc,#8 @ sha1_block_data_order ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P @@ -264,8 +264,10 @@ $code.=<<___; .LK_20_39: .word 0x6ed9eba1 .LK_40_59: .word 0x8f1bbcdc .LK_60_79: .word 0xca62c1d6 +#if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: .word OPENSSL_armcap_P-sha1_block_data_order +#endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 5 ___ @@ -476,7 +478,8 @@ sub Xloop() } $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a .fpu neon .type sha1_block_data_order_neon,%function @@ -563,7 +566,7 @@ my @Kxx=map("q$_",(8..11)); my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 .type sha1_block_data_order_armv8,%function .align 5 sha1_block_data_order_armv8: @@ -637,7 +640,9 @@ $code.=<<___; ___ }}} $code.=<<___; +#if __ARM_MAX_ARCH__>=7 .comm OPENSSL_armcap_P,4,4 +#endif ___ { my %opcode = ( diff --git 
a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl index 505ca8f35..b0ae93633 100644 --- a/crypto/sha/asm/sha256-armv4.pl +++ b/crypto/sha/asm/sha256-armv4.pl @@ -177,8 +177,10 @@ K256: .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: .word OPENSSL_armcap_P-sha256_block_data_order +#endif .align 5 .global sha256_block_data_order @@ -186,7 +188,7 @@ K256: sha256_block_data_order: sub r3,pc,#8 @ sha256_block_data_order add $len,$inp,$len,lsl#6 @ len to point at the end of inp -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P tst r12,#ARMV8_SHA256 @@ -423,7 +425,8 @@ sub body_00_15 () { } $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a .fpu neon .type sha256_block_data_order_neon,%function @@ -545,7 +548,7 @@ my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); my $Ktbl="r3"; $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 .type sha256_block_data_order_armv8,%function .align 5 sha256_block_data_order_armv8: @@ -616,7 +619,9 @@ ___ $code.=<<___; .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 2 +#if __ARM_MAX_ARCH__>=7 .comm OPENSSL_armcap_P,4,4 +#endif ___ { my %opcode = ( diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl index 1d5275b91..fb7dc506a 100644 --- a/crypto/sha/asm/sha512-armv4.pl +++ b/crypto/sha/asm/sha512-armv4.pl @@ -237,16 +237,20 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: .word OPENSSL_armcap_P-sha512_block_data_order .skip 32-4 +#else +.skip 32 +#endif .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: sub r3,pc,#8 @ sha512_block_data_order add $len,$inp,$len,lsl#7 @ len to point
at the end of inp -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P tst r12,#1 @@ -551,7 +555,8 @@ ___ } $code.=<<___; -#if __ARM_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a .fpu neon .align 4 @@ -592,7 +597,9 @@ $code.=<<___; .size sha512_block_data_order,.-sha512_block_data_order .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " .align 2 +#if __ARM_MAX_ARCH__>=7 .comm OPENSSL_armcap_P,4,4 +#endif ___ $code =~ s/\`([^\`]*)\`/eval $1/gem;