Adapt ARM assembly pack for iOS.
This is achieved by filtering perlasm output through arm-xlate.pl. But note that it's done only if "flavour" argument is not 'void'. As 'void' is default value for other ARM targets, permasm output is not actually filtered on previously validated platforms. Reviewed-by: Dr. Stephen Henson <steve@openssl.org> (cherry picked from commit 874faf2ffb22187ad5483d9691a3a2eb7112f161)
This commit is contained in:
parent
728b53058e
commit
bb98f6bef6
crypto
aes/asm
bn/asm
modes/asm
sha/asm
@ -32,8 +32,20 @@
|
||||
# Profiler-assisted and platform-specific optimization resulted in 16%
|
||||
# improvement on Cortex A8 core and ~21.5 cycles per byte.
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
$flavour = shift;
|
||||
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$s0="r0";
|
||||
$s1="r1";
|
||||
@ -171,7 +183,12 @@ AES_encrypt:
|
||||
stmdb sp!,{r1,r4-r12,lr}
|
||||
mov $rounds,r0 @ inp
|
||||
mov $key,r2
|
||||
#ifdef __APPLE__
|
||||
mov $tbl,#AES_encrypt-AES_Te
|
||||
sub $tbl,r3,$tbl @ Te
|
||||
#else
|
||||
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
|
||||
#endif
|
||||
#if __ARM_ARCH__<7
|
||||
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
|
||||
ldrb $t1,[$rounds,#2] @ manner...
|
||||
@ -425,7 +442,12 @@ AES_set_encrypt_key:
|
||||
bne .Labrt
|
||||
|
||||
.Lok: stmdb sp!,{r4-r12,lr}
|
||||
#ifdef __APPLE__
|
||||
mov $tbl,#AES_set_encrypt_key-AES_Te-1024
|
||||
sub $tbl,r3,$tbl @ Te4
|
||||
#else
|
||||
sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
|
||||
#endif
|
||||
|
||||
mov $rounds,r0 @ inp
|
||||
mov lr,r1 @ bits
|
||||
@ -886,7 +908,12 @@ AES_decrypt:
|
||||
stmdb sp!,{r1,r4-r12,lr}
|
||||
mov $rounds,r0 @ inp
|
||||
mov $key,r2
|
||||
#ifdef __APPLE__
|
||||
mov $tbl,#AES_decrypt-AES_Td
|
||||
sub $tbl,r3,$tbl @ Td
|
||||
#else
|
||||
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
|
||||
#endif
|
||||
#if __ARM_ARCH__<7
|
||||
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
|
||||
ldrb $t1,[$rounds,#2] @ manner...
|
||||
|
@ -21,8 +21,20 @@
|
||||
# runs in even less cycles, ~30, improvement is measurable only on
|
||||
# longer keys. One has to optimize code elsewhere to get NEON glow...
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
$flavour = shift;
|
||||
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||||
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||||
@ -170,11 +182,18 @@ bn_GF2m_mul_2x2:
|
||||
#if __ARM_ARCH__>=7
|
||||
ldr r12,.LOPENSSL_armcap
|
||||
.Lpic: ldr r12,[pc,r12]
|
||||
#ifdef __APPLE__
|
||||
ldr r12,[r12]
|
||||
#endif
|
||||
tst r12,#1
|
||||
beq .Lialu
|
||||
|
||||
veor $A1,$A1
|
||||
#ifdef __APPLE__
|
||||
vmov $B1,r3,r3 @ two copies of b1
|
||||
#else
|
||||
vmov.32 $B1,r3,r3 @ two copies of b1
|
||||
#endif
|
||||
vmov.32 ${A1}[0],r1 @ a1
|
||||
|
||||
veor $A0,$A0
|
||||
|
@ -23,8 +23,20 @@
|
||||
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
|
||||
# about decorations, ABI and instruction syntax are identical.
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
$flavour = shift;
|
||||
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$num="r0"; # starts as num argument, but holds &tp[num-1]
|
||||
$ap="r1";
|
||||
|
@ -57,8 +57,20 @@
|
||||
# *native* byte order on current platform. See gcm128.c for working
|
||||
# example...
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
$flavour = shift;
|
||||
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$Xi="r0"; # argument block
|
||||
$Htbl="r1";
|
||||
@ -112,6 +124,11 @@ $code=<<___;
|
||||
.text
|
||||
.code 32
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define ldrplb ldrbpl
|
||||
#define ldrneb ldrbne
|
||||
#endif
|
||||
|
||||
.type rem_4bit,%object
|
||||
.align 5
|
||||
rem_4bit:
|
||||
@ -326,9 +343,9 @@ $code.=<<___;
|
||||
.align 4
|
||||
gcm_gmult_neon:
|
||||
sub $Htbl,#16 @ point at H in GCM128_CTX
|
||||
vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
|
||||
vld1.64 `&Dhi("$IN")`,[$Xi]! @ load Xi
|
||||
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
|
||||
vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
|
||||
vld1.64 `&Dlo("$IN")`,[$Xi]!
|
||||
vshr.u64 $mod,#32
|
||||
vldmia $Htbl,{$Hhi-$Hlo} @ load H
|
||||
veor $zero,$zero
|
||||
@ -349,9 +366,9 @@ gcm_gmult_neon:
|
||||
.type gcm_ghash_neon,%function
|
||||
.align 4
|
||||
gcm_ghash_neon:
|
||||
vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
|
||||
vld1.64 `&Dhi("$Z")`,[$Xi]! @ load Xi
|
||||
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
|
||||
vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
|
||||
vld1.64 `&Dlo("$Z")`,[$Xi]!
|
||||
vshr.u64 $mod,#32
|
||||
vldmia $Xi,{$Hhi-$Hlo} @ load H
|
||||
veor $zero,$zero
|
||||
@ -410,8 +427,8 @@ gcm_ghash_neon:
|
||||
vrev64.8 $Z,$Z
|
||||
#endif
|
||||
sub $Xi,#16
|
||||
vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
|
||||
vst1.64 `&Dlo("$Z")`,[$Xi,:64]
|
||||
vst1.64 `&Dhi("$Z")`,[$Xi]! @ write out Xi
|
||||
vst1.64 `&Dlo("$Z")`,[$Xi]
|
||||
|
||||
bx lr
|
||||
.size gcm_ghash_neon,.-gcm_ghash_neon
|
||||
|
@ -52,8 +52,20 @@
|
||||
# Profiler-assisted and platform-specific optimization resulted in 10%
|
||||
# improvement on Cortex A8 core and 12.2 cycles per byte.
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
$flavour = shift;
|
||||
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$ctx="r0";
|
||||
$inp="r1";
|
||||
|
@ -23,8 +23,20 @@
|
||||
# Profiler-assisted and platform-specific optimization resulted in 16%
|
||||
# improvement on Cortex A8 core and ~17 cycles per processed byte.
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
$flavour = shift;
|
||||
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$ctx="r0"; $t0="r0";
|
||||
$inp="r1"; $t3="r1";
|
||||
|
@ -38,8 +38,20 @@ $hi="HI";
|
||||
$lo="LO";
|
||||
# ====================================================================
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
$flavour = shift;
|
||||
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$ctx="r0"; # parameter block
|
||||
$inp="r1";
|
||||
@ -221,17 +233,21 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
|
||||
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
|
||||
.size K512,.-K512
|
||||
.LOPENSSL_armcap:
|
||||
.word OPENSSL_armcap_P-sha512_block_data_order
|
||||
.word OPENSSL_armcap_P-.Lsha512_block_data_order
|
||||
.skip 32-4
|
||||
|
||||
.global sha512_block_data_order
|
||||
.type sha512_block_data_order,%function
|
||||
sha512_block_data_order:
|
||||
.Lsha512_block_data_order:
|
||||
sub r3,pc,#8 @ sha512_block_data_order
|
||||
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
|
||||
#if __ARM_ARCH__>=7
|
||||
ldr r12,.LOPENSSL_armcap
|
||||
ldr r12,[r3,r12] @ OPENSSL_armcap_P
|
||||
#ifdef __APPLE__
|
||||
ldr r12,[r12]
|
||||
#endif
|
||||
tst r12,#1
|
||||
bne .LNEON
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user