ARM assembler pack: profiler-assisted optimizations and NEON support.

Andy Polyakov 2011-04-01 20:58:34 +00:00
parent d8d958323b
commit 1e86318091
13 changed files with 715 additions and 224 deletions

Configure

@ -398,8 +398,9 @@ my %table=(
"linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}", "linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
"linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}", "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
# Android: linux-armv4 but without -DTERMIO and pointers to headers and libs. # Android: linux-* but without -DTERMIO and pointers to headers and libs.
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
#### *BSD [do see comment about ${BSDthreads} above!] #### *BSD [do see comment about ${BSDthreads} above!]
"BSD-generic32","gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "BSD-generic32","gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

TABLE

@ -1001,6 +1001,38 @@ $sys_id =
$lflags = -ldl $lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj = $cpuid_obj =
$bn_obj =
$des_obj =
$aes_obj =
$bf_obj =
$md5_obj =
$sha1_obj =
$cast_obj =
$rc4_obj =
$rmd160_obj =
$rc5_obj =
$wp_obj =
$cmll_obj =
$modes_obj =
$perlasm_scheme = void
$dso_scheme = dlfcn
$shared_target= linux-shared
$shared_cflag = -fPIC
$shared_ldflag =
$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib =
$arflags =
$multilib =
*** android-armv7
$cc = gcc
$cflags = -march=armv7-a -mandroid -I$(ANDROID_DEV)/include -B$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
$lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj =
$bn_obj = bn_asm.o armv4-mont.o $bn_obj = bn_asm.o armv4-mont.o
$des_obj = $des_obj =
$aes_obj = aes_cbc.o aes-armv4.o $aes_obj = aes_cbc.o aes-armv4.o

config

@ -821,6 +821,7 @@ case "$GUESSOS" in
beos-*) OUT="$GUESSOS" ;; beos-*) OUT="$GUESSOS" ;;
x86pc-*-qnx6) OUT="QNX6-i386" ;; x86pc-*-qnx6) OUT="QNX6-i386" ;;
*-*-qnx6) OUT="QNX6" ;; *-*-qnx6) OUT="QNX6" ;;
armv[7-9]*-*-android) OUT="android-armv7" ;;
*) OUT=`echo $GUESSOS | awk -F- '{print $3}'`;; *) OUT=`echo $GUESSOS | awk -F- '{print $3}'`;;
esac esac

crypto/aes/Makefile

@ -68,7 +68,8 @@ aes-parisc.s: asm/aes-parisc.pl
$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@ $(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
# GNU make "catch all" # GNU make "catch all"
aes-%.s: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
aes-armv4.o: aes-armv4.S
files: files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

crypto/aes/asm/aes-armv4.pl

@ -27,6 +27,11 @@
# Rescheduling for dual-issue pipeline resulted in 12% improvement on # Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key. # Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
@ -46,6 +51,7 @@ $key="r11";
$rounds="r12"; $rounds="r12";
$code=<<___; $code=<<___;
#include "arm_arch.h"
.text .text
.code 32 .code 32
@ -166,7 +172,7 @@ AES_encrypt:
mov $rounds,r0 @ inp mov $rounds,r0 @ inp
mov $key,r2 mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te sub $tbl,r3,#AES_encrypt-AES_Te @ Te
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner... ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
@ -195,10 +201,33 @@ AES_encrypt:
orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24 orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
bl _armv4_AES_encrypt bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner... mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8 mov $t3,$s0,lsr#8
@ -227,11 +256,15 @@ AES_encrypt:
strb $t2,[$rounds,#13] strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14] strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15] strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_encrypt,.-AES_encrypt .size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function .type _armv4_AES_encrypt,%function
@ -271,11 +304,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1 and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8 eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2 and $i3,lr,$s2
eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
eor $s1,$s1,$t1,ror#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24 mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16 eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@ -284,16 +317,16 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16 eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2 and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
eor $s2,$s2,$t2,ror#16
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24 mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24 eor $s0,$s0,$i1,ror#24
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16 ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8 eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12] ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8 eor $s3,$s3,$t3,ror#8
@ -333,11 +366,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1 and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8 eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2 and $i3,lr,$s2
eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
eor $s1,$t1,$s1,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24 mov $s2,$s2,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8 eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@ -346,15 +379,15 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8 eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2 and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
eor $s2,$t2,$s2,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24 mov $s3,$s3,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8 eor $s0,$i1,$s0,lsl#8
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0] ldr $i1,[$key,#0]
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8 eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4] ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16 eor $s2,$s2,$i3,lsl#16
@ -398,6 +431,7 @@ AES_set_encrypt_key:
mov lr,r1 @ bits mov lr,r1 @ bits
mov $key,r2 @ key mov $key,r2 @ key
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner... ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
@ -430,6 +464,22 @@ AES_set_encrypt_key:
orr $s3,$s3,$t3,lsl#24 orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8] str $s2,[$key,#-8]
str $s3,[$key,#-4] str $s3,[$key,#-4]
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$key],#16
str $s1,[$key,#-12]
str $s2,[$key,#-8]
str $s3,[$key,#-4]
#endif
teq lr,#128 teq lr,#128
bne .Lnot128 bne .Lnot128
@ -466,6 +516,7 @@ AES_set_encrypt_key:
b .Ldone b .Ldone
.Lnot128: .Lnot128:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19] ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18] ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17] ldrb $t2,[$rounds,#17]
@ -482,6 +533,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8 str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24 orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4] str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#16]
ldr $i3,[$rounds,#20]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
teq lr,#192 teq lr,#192
bne .Lnot192 bne .Lnot192
@ -526,6 +587,7 @@ AES_set_encrypt_key:
b .L192_loop b .L192_loop
.Lnot192: .Lnot192:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27] ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26] ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25] ldrb $t2,[$rounds,#25]
@ -542,6 +604,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8 str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24 orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4] str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#24]
ldr $i3,[$rounds,#28]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
mov $rounds,#14 mov $rounds,#14
str $rounds,[$key,#240-32] str $rounds,[$key,#240-32]
@ -692,10 +764,14 @@ $code.=<<___;
bne .Lmix bne .Lmix
mov r0,#0 mov r0,#0
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_set_decrypt_key,.-AES_set_decrypt_key .size AES_set_decrypt_key,.-AES_set_decrypt_key
.type AES_Td,%object .type AES_Td,%object
@ -811,7 +887,7 @@ AES_decrypt:
mov $rounds,r0 @ inp mov $rounds,r0 @ inp
mov $key,r2 mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td sub $tbl,r3,#AES_decrypt-AES_Td @ Td
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner... ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
@ -840,10 +916,33 @@ AES_decrypt:
orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24 orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
bl _armv4_AES_decrypt bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner... mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8 mov $t3,$s0,lsr#8
@ -872,11 +971,15 @@ AES_decrypt:
strb $t2,[$rounds,#13] strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14] strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15] strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_decrypt,.-AES_decrypt .size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function .type _armv4_AES_decrypt,%function
@ -916,11 +1019,11 @@ _armv4_AES_decrypt:
and $i2,lr,$s2 @ i1 and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8 eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16 and $i3,lr,$s2,lsr#16
eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
eor $s1,$s1,$t1,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24 mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16 eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@ -929,22 +1032,22 @@ _armv4_AES_decrypt:
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8 eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2 and $i3,lr,$s3 @ i2
eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
eor $s2,$s2,$t2,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24 mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8 eor $s0,$s0,$i1,ror#8
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s1,$s1,$i2,ror#16
eor $s2,$s2,$i3,ror#24
ldr $i1,[$key],#16 ldr $i1,[$key],#16
eor $s3,$s3,$t3,ror#8 eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
ldr $t1,[$key,#-12] ldr $t1,[$key,#-12]
ldr $t2,[$key,#-8]
eor $s0,$s0,$i1 eor $s0,$s0,$i1
ldr $t2,[$key,#-8]
eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4] ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16 and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1 eor $s1,$s1,$t1
@ -985,11 +1088,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s2,lsr#8 @ i0 and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8 eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1 and $i2,lr,$s2 @ i1
eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
eor $t3,$t3,$i3,lsl#8
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16 and $i3,lr,$s2,lsr#16
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8 eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@ -997,11 +1100,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s3,lsr#16 @ i0 and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16 eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
eor $t3,$t3,$i3,lsl#16
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2 and $i3,lr,$s3 @ i2
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16 eor $s0,$s0,$i1,lsl#16
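
The recurring #if __ARM_ARCH__<7 split above comes down to how a big-endian 32-bit word is fetched: pre-ARMv7 code assembles it byte by byte (endian-neutral and alignment-safe), while the ARMv7 path does a word load, which tolerates unaligned addresses, followed by rev on little-endian. A rough C equivalent of the value both paths leave in the register (illustrative helper, not an OpenSSL function):

    #include <stdint.h>

    /* big-endian view of four input bytes: the value the byte-by-byte loads
     * build with orr/lsl, and the value ldr+rev produces on little-endian */
    static uint32_t load_be32(const unsigned char *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
    }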

crypto/modes/Makefile

@ -57,7 +57,9 @@ ghash-parisc.s: asm/ghash-parisc.pl
$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@ $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
# GNU make "catch all" # GNU make "catch all"
ghash-%.s: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
ghash-armv4.o: ghash-armv4.S
files: files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

crypto/modes/asm/ghash-armv4.pl

@ -25,6 +25,18 @@
# Cortex A8 core and ~25 cycles per processed byte (which was observed # Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-) # to be ~3 times faster than gcc-generated code:-)
# #
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to # Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons: # implement it for following reasons:
# #
@ -52,6 +64,7 @@ $Xi="r0"; # argument block
$Htbl="r1"; $Htbl="r1";
$inp="r2"; $inp="r2";
$len="r3"; $len="r3";
$Zll="r4"; # variables $Zll="r4"; # variables
$Zlh="r5"; $Zlh="r5";
$Zhl="r6"; $Zhl="r6";
@ -72,8 +85,13 @@ sub Zsmash() {
my $i=12; my $i=12;
my @args=@_; my @args=@_;
for ($Zll,$Zlh,$Zhl,$Zhh) { for ($Zll,$Zlh,$Zhl,$Zhh) {
# can be reduced to single "str $_,[$Xi,$i]" on big-endian platforms
$code.=<<___; $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $_,$_
str $_,[$Xi,#$i]
#elif defined(__ARMEB__)
str $_,[$Xi,#$i]
#else
mov $Tlh,$_,lsr#8 mov $Tlh,$_,lsr#8
strb $_,[$Xi,#$i+3] strb $_,[$Xi,#$i+3]
mov $Thl,$_,lsr#16 mov $Thl,$_,lsr#16
@ -81,6 +99,7 @@ sub Zsmash() {
mov $Thh,$_,lsr#24 mov $Thh,$_,lsr#24
strb $Thl,[$Xi,#$i+1] strb $Thl,[$Xi,#$i+1]
strb $Thh,[$Xi,#$i] strb $Thh,[$Xi,#$i]
#endif
___ ___
$code.="\t".shift(@args)."\n"; $code.="\t".shift(@args)."\n";
$i-=4; $i-=4;
@ -88,6 +107,8 @@ ___
} }
$code=<<___; $code=<<___;
#include "arm_arch.h"
.text .text
.code 32 .code 32
@ -149,41 +170,41 @@ gcm_ghash_4bit:
and $nlo,$nlo,#0x0f and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 eor $Zhh,$Zhh,$Tll,lsl#16
.Loop: .Linner:
add $Thh,$Htbl,$nlo,lsl#4 add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
and $nlo,$Zll,#0xf @ rem and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28 eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4 eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4 eor $Zhh,$Thh,$Zhh,lsr#4
ldrplb $nlo,[$inp,$cnt]
add $Thh,$Htbl,$nhi add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
and $nhi,$Zll,#0xf @ rem and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28 eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4 eor $Zlh,$Tlh,$Zlh,lsr#4
ldrplb $nhi,[$Xi,$cnt] ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhl,$Zhl,$Zhh,lsl#28
eorpl $nlo,$nlo,$nhi eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4 eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0 andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop bpl .Linner
ldr $len,[sp,#32] @ re-load $len/end ldr $len,[sp,#32] @ re-load $len/end
add $inp,$inp,#16 add $inp,$inp,#16
@ -194,10 +215,14 @@ $code.=<<___;
bne .Louter bne .Louter
add sp,sp,#36 add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr} ldmia sp!,{r4-r11,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit .size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit .global gcm_gmult_4bit
@ -231,31 +256,31 @@ gcm_gmult_4bit:
eor $Zhh,$Zhh,$Tll,lsl#16 eor $Zhh,$Zhh,$Tll,lsl#16
and $nlo,$nlo,#0x0f and $nlo,$nlo,#0x0f
.Loop2: .Loop:
add $Thh,$Htbl,$nlo,lsl#4 add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
and $nlo,$Zll,#0xf @ rem and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28 eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4 eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4 eor $Zhh,$Thh,$Zhh,lsr#4
ldrplb $nlo,[$Xi,$cnt]
add $Thh,$Htbl,$nhi add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
and $nhi,$Zll,#0xf @ rem and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28 eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4 eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhl,$Zhl,$Zhh,lsl#28
@ -263,16 +288,138 @@ gcm_gmult_4bit:
andpl $nhi,$nlo,#0xf0 andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop2 bpl .Loop
___ ___
&Zsmash(); &Zsmash();
$code.=<<___; $code.=<<___;
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr} ldmia sp!,{r4-r11,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit .size gcm_gmult_4bit,.-gcm_gmult_4bit
.asciz "GHASH for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" ___
{
my $cnt=$Htbl; # $Htbl is used once in the very beginning
my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
# in Zo. Or should I say "top bit", because GHASH is specified in
# reverse bit order? Otherwise straightforward 128-bit H by one input
# byte multiplication and modulo-reduction, times 16.
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
sub $Htbl,#16 @ point at H in GCM128_CTX
vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
vshr.u64 $mod,#32
vldmia $Htbl,{$Hhi-$Hlo} @ load H
veor $zero,$zero
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Qpost,$Qpost
veor $R,$R
mov $cnt,#16
veor $Z,$Z
mov $len,#16
veor $Zo,$Zo
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
b .Linner_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
vshr.u64 $mod,#32
vldmia $Xi,{$Hhi-$Hlo} @ load H
veor $zero,$zero
nop
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
.Louter_neon:
vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
veor $Qpost,$Qpost
vld1.64 `&Dlo($IN)`,[$inp]!
veor $R,$R
mov $cnt,#16
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Zo,$Zo
veor $IN,$Z @ inp^=Xi
veor $Z,$Z
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
.Linner_neon:
subs $cnt,$cnt,#1
vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
vext.8 $IN,$zero,#1 @ IN>>=8
veor $Z,$Qpost @ modulo-scheduled part
vshl.i64 `&Dlo("$R")`,#48
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
veor `&Dhi("$Z")`,`&Dlo("$R")`
vuzp.8 $Qlo,$Qhi
vsli.8 $Zo,$T,#1 @ compose the "carry" byte
vext.8 $Z,$zero,#1 @ Z>>=8
vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
veor $Z,$Qhi
bne .Linner_neon
veor $Z,$Qpost @ modulo-scheduled artefact
vshl.i64 `&Dlo("$R")`,#48
veor `&Dhi("$Z")`,`&Dlo("$R")`
@ finalization, normalize Z:Zo
vand $Zo,$mod @ suffices to mask the bit
vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
vshl.i64 $Z,#1
subs $len,#16
vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
bne .Louter_neon
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
sub $Xi,#16
vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
vst1.64 `&Dlo("$Z")`,[$Xi,:64]
bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2 .align 2
___ ___
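
Both gcm_gmult_4bit and the new gcm_gmult_neon compute Xi = Xi*H in GF(2^128) with GCM's reduction polynomial, the 0xe1 constant the NEON code loads into $mod. For reference, a plain bit-at-a-time C sketch of that multiplication, assuming Xi and H are held as two 64-bit halves with the most significant half first (function and variable names here are illustrative, not OpenSSL's):

    #include <stdint.h>

    static void gf128_gmult(uint64_t Xi[2], const uint64_t H[2])
    {
        uint64_t Z0 = 0, Z1 = 0;        /* accumulator Z */
        uint64_t V0 = H[0], V1 = H[1];  /* V starts as H; V0 is the high half */

        for (int i = 0; i < 128; i++) {
            /* bit i of Xi, most significant bit first (GHASH's "reverse" bit order) */
            if ((Xi[i >> 6] >> (63 - (i & 63))) & 1) {
                Z0 ^= V0;
                Z1 ^= V1;
            }
            /* V >>= 1; if a bit falls off, fold it back in via 0xe1||0^120 */
            uint64_t carry = V1 & 1;
            V1 = (V1 >> 1) | (V0 << 63);
            V0 >>= 1;
            if (carry)
                V0 ^= 0xe100000000000000ULL;
        }
        Xi[0] = Z0;
        Xi[1] = Z1;
    }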

crypto/modes/gcm128.c

@ -642,27 +642,38 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
#endif #endif
#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \ #if TABLE_BITS==4 && defined(GHASH_ASM)
# if !defined(I386_ONLY) && \
(defined(__i386) || defined(__i386__) || \ (defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \ defined(__x86_64) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64)) defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define GHASH_ASM_X86_OR_64 # define GHASH_ASM_X86_OR_64
# define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2]; extern unsigned int OPENSSL_ia32cap_P[2];
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]); void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# if defined(__i386) || defined(__i386__) || defined(_M_IX86) # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define GHASH_ASM_X86 # define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]); void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif # endif
# elif defined(__arm__) || defined(__arm)
# include "arm_arch.h"
# if __ARM_ARCH__>=7
# define GHASH_ASM_ARM
# define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_armcap;
# define GCM_FUNCREF_4BIT void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# endif
#endif #endif
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
@ -715,6 +726,15 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
ctx->gmult = gcm_gmult_4bit; ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit; ctx->ghash = gcm_ghash_4bit;
# endif # endif
# elif defined(GHASH_ASM_ARM)
if (OPENSSL_armcap & 1) {
ctx->gmult = gcm_gmult_neon;
ctx->ghash = gcm_ghash_neon;
} else {
gcm_init_4bit(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
}
# else # else
gcm_init_4bit(ctx->Htable,ctx->H.u); gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif # endif

crypto/sha/Makefile

@ -56,8 +56,8 @@ sha256-ia64.s: asm/sha512-ia64.pl
sha512-ia64.s: asm/sha512-ia64.pl sha512-ia64.s: asm/sha512-ia64.pl
(cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS)) (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
sha256-armv4.s: asm/sha256-armv4.pl sha256-armv4.S: asm/sha256-armv4.pl
$(PERL) $< $@ $(PERL) $< $(PERLASM_SCHEME) $@
sha1-alpha.s: asm/sha1-alpha.pl sha1-alpha.s: asm/sha1-alpha.pl
$(PERL) $< | $(CC) -E - | tee $@ > /dev/null $(PERL) $< | $(CC) -E - | tee $@ > /dev/null
@ -83,9 +83,13 @@ sha256-mips.s: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME)
sha512-mips.s: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@ sha512-mips.s: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@
# GNU make "catch all" # GNU make "catch all"
sha1-%.s: asm/sha1-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ sha1-%.S: asm/sha1-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha256-%.s: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ sha256-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha512-%.s: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha1-armv4-large.o: sha1-armv4-large.S
sha256-armv4.o: sha256-armv4.S
sha512-armv4.o: sha512-armv4.S
files: files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

crypto/sha/asm/sha1-armv4-large.pl

@ -47,6 +47,10 @@
# Cortex A8 core and in absolute terms ~870 cycles per input block # Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte]. # [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
@ -76,31 +80,41 @@ $code.=<<___;
add $e,$K,$e,ror#2 @ E+=K_xx_xx add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4] ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1 eor $t0,$t0,$t1
eor $t2,$t2,$t3 eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31 mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27) add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31 eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx $opt1 @ F_xx_xx
$opt2 @ F_xx_xx $opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i] add $e,$e,$t0 @ E+=X[i]
str $t0,[$Xi,#-4]!
___ ___
} }
sub BODY_00_15 { sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_; my ($a,$b,$c,$d,$e)=@_;
$code.=<<___; $code.=<<___;
#if __ARM_ARCH__<7
ldrb $t1,[$inp,#2] ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3] ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1] ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19 add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4 ldrb $t3,[$inp],#4
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t1,lsl#8 orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16 orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24 orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2 and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i] add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
@ -136,6 +150,8 @@ ___
} }
$code=<<___; $code=<<___;
#include "arm_arch.h"
.text .text
.global sha1_block_data_order .global sha1_block_data_order
@ -209,10 +225,14 @@ $code.=<<___;
teq $inp,$len teq $inp,$len
bne .Lloop @ [+18], total 1307 bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.align 2 .align 2
.LK_00_19: .word 0x5a827999 .LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1 .LK_20_39: .word 0x6ed9eba1

crypto/sha/asm/sha256-armv4.pl

@ -18,11 +18,16 @@
# Rescheduling for dual-issue pipeline resulted in 22% improvement on # Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte. # Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
$ctx="r0"; $t0="r0"; $ctx="r0"; $t0="r0";
$inp="r1"; $inp="r1"; $t3="r1";
$len="r2"; $t1="r2"; $len="r2"; $t1="r2";
$T1="r3"; $T1="r3";
$A="r4"; $A="r4";
@ -46,6 +51,9 @@ sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16); $code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
ldr $T1,[$inp],#4
#else
ldrb $T1,[$inp,#3] @ $i ldrb $T1,[$inp,#3] @ $i
ldrb $t2,[$inp,#2] ldrb $t2,[$inp,#2]
ldrb $t1,[$inp,#1] ldrb $t1,[$inp,#1]
@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
orr $T1,$T1,$t2,lsl#8 orr $T1,$T1,$t2,lsl#8
orr $T1,$T1,$t1,lsl#16 orr $T1,$T1,$t1,lsl#16
orr $T1,$T1,$t0,lsl#24 orr $T1,$T1,$t0,lsl#24
`"str $inp,[sp,#17*4]" if ($i==15)` #endif
___ ___
$code.=<<___; $code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
mov $t0,$e,ror#$Sigma1[0] mov $t0,$e,ror#$Sigma1[0]
str $T1,[sp,#`$i%16`*4] ldr $t2,[$Ktbl],#4 @ *K256++
eor $t0,$t0,$e,ror#$Sigma1[1] eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t1,$f,$g eor $t1,$f,$g
#if $i>=16
add $T1,$T1,$t3 @ from BODY_16_xx
#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $T1,$T1
#endif
#if $i==15
str $inp,[sp,#17*4] @ leave room for $t3
#endif
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e and $t1,$t1,$e
str $T1,[sp,#`$i%16`*4]
add $T1,$T1,$t0 add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g) eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$h add $T1,$T1,$h
@ -71,6 +87,9 @@ $code.=<<___;
eor $h,$h,$a,ror#$Sigma0[1] eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2 add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
#if $i>=15
ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
#endif
orr $t0,$a,$b orr $t0,$a,$b
and $t1,$a,$b and $t1,$a,$b
and $t0,$t0,$c and $t0,$t0,$c
@ -85,24 +104,26 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___; $code.=<<___;
ldr $t1,[sp,#`($i+1)%16`*4] @ $i @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4] ldr $t2,[sp,#`($i+14)%16`*4]
mov $t0,$t3,ror#$sigma0[0]
ldr $T1,[sp,#`($i+0)%16`*4] ldr $T1,[sp,#`($i+0)%16`*4]
mov $t0,$t1,ror#$sigma0[0] eor $t0,$t0,$t3,ror#$sigma0[1]
ldr $inp,[sp,#`($i+9)%16`*4] ldr $t1,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1] eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) mov $t3,$t2,ror#$sigma1[0]
mov $t1,$t2,ror#$sigma1[0]
add $T1,$T1,$t0 add $T1,$T1,$t0
eor $t1,$t1,$t2,ror#$sigma1[1] eor $t3,$t3,$t2,ror#$sigma1[1]
add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
add $T1,$T1,$t1 add $T1,$T1,$t1
eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
@ add $T1,$T1,$t3
___ ___
&BODY_00_15(@_); &BODY_00_15(@_);
} }
$code=<<___; $code=<<___;
#include "arm_arch.h"
.text .text
.code 32 .code 32
@ -132,7 +153,7 @@ K256:
sha256_block_data_order: sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256 @ K256 sub $Ktbl,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16]) sub sp,sp,#16*4 @ alloca(X[16])
@ -171,10 +192,14 @@ $code.=<<___;
bne .Loop bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame add sp,sp,#`16+3`*4 @ destroy frame
ldmia sp!,{r4-r12,lr} #if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order .size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2 .align 2
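
The three-instruction F_00_19 sequence in sha1-armv4-large.pl (eor, and, eor) and the or/and pattern the SHA-256 and SHA-512 rounds use for @ Maj(a,b,c) lean on two standard bitwise identities: Ch(x,y,z) = z ^ (x & (y ^ z)) and Maj(x,y,z) = ((x | y) & z) | (x & y). A tiny self-checking C sketch (illustrative only):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) | (~x & z); }
    static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (x & z) | (y & z); }

    int main(void)
    {
        /* everything is bitwise, so single-bit arguments cover all cases */
        for (uint32_t x = 0; x < 2; x++)
            for (uint32_t y = 0; y < 2; y++)
                for (uint32_t z = 0; z < 2; z++) {
                    assert(ch(x, y, z)  == (z ^ (x & (y ^ z))));         /* eor, and, eor      */
                    assert(maj(x, y, z) == (((x | y) & z) | (x & y)));   /* orr, and, and, orr */
                }
        return 0;
    }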

crypto/sha/asm/sha512-armv4.pl

@ -18,22 +18,33 @@
# Rescheduling for dual-issue pipeline resulted in 6% improvement on # Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte. # Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 25.5 cycles or 47% faster than integer-only code.
# Byte order [in]dependence. ========================================= # Byte order [in]dependence. =========================================
# #
# Caller is expected to maintain specific *dword* order in h[0-7], # Originally caller was expected to maintain specific *dword* order in
# namely with most significant dword at *lower* address, which is # h[0-7], namely with most significant dword at *lower* address, which
# reflected in below two parameters. *Byte* order within these dwords # was reflected in below two parameters as 0 and 4. Now caller is
# in turn is whatever *native* byte order on current platform. # expected to maintain native byte order for whole 64-bit values.
$hi=0; $hi="HI";
$lo=4; $lo="LO";
# ==================================================================== # ====================================================================
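
The $hi/$lo change above is what the HI/LO macros and the WORD64 wrapper added to this file encode: on a little-endian target the low 32 bits of a natively stored 64-bit value sit at offset 0 and the high 32 bits at offset 4, and the other way round on big-endian. A throwaway C illustration (not OpenSSL code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        union { uint64_t d; uint32_t w[2]; } u;
        u.d = 0x0123456789abcdefULL;
        /* little-endian: w[0] == 0x89abcdef, w[1] == 0x01234567  (LO at 0, HI at 4)
         * big-endian:    w[0] == 0x01234567, w[1] == 0x89abcdef  (HI at 0, LO at 4) */
        printf("halves at offsets 0 and 4: %08x %08x\n", (unsigned)u.w[0], (unsigned)u.w[1]);
        return 0;
    }
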
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
$ctx="r0"; $ctx="r0"; # parameter block
$inp="r1"; $inp="r1";
$len="r2"; $len="r2";
$Tlo="r3"; $Tlo="r3";
$Thi="r4"; $Thi="r4";
$Alo="r5"; $Alo="r5";
@ -61,15 +72,17 @@ $Xoff=8*8;
sub BODY_00_15() { sub BODY_00_15() {
my $magic = shift; my $magic = shift;
$code.=<<___; $code.=<<___;
ldr $t2,[sp,#$Hoff+0] @ h.lo
ldr $t3,[sp,#$Hoff+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14 mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14 mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18 eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18 eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18 eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18 eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14 eor $t0,$t0,$Ehi,lsl#14
@ -96,25 +109,24 @@ $code.=<<___;
and $t1,$t1,$Ehi and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4] str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2 eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#4] @ K[i].lo ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g) eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2 adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i] adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo adds $Elo,$Elo,$Tlo
adc $Ehi,$Ehi,$Thi @ d += T
and $t0,$t2,#0xff
teq $t0,#$magic
orreq $Ktbl,$Ktbl,#1
ldr $t2,[sp,#$Boff+0] @ b.lo ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo ldr $t3,[sp,#$Coff+0] @ c.lo
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@ -131,80 +143,100 @@ $code.=<<___;
eor $t0,$t0,$Alo,lsl#25 eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a) adc $Thi,$Thi,$t1 @ T += Sigma0(a)
and $t0,$Alo,$t2
orr $Alo,$Alo,$t2
ldr $t1,[sp,#$Boff+4] @ b.hi ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3 and $Alo,$Alo,$t3
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $t3,$Ahi,$t1 and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1 orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2 and $Ahi,$Ahi,$t2
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
adds $Alo,$Alo,$Tlo adds $Alo,$Alo,$Tlo
adc $Ahi,$Ahi,$Thi @ h += T orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8 sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8 add $Ktbl,$Ktbl,#8
___ ___
} }
$code=<<___; $code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text .text
.code 32 .code 32
.type K512,%object .type K512,%object
.align 5 .align 5
K512: K512:
.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512 .size K512,.-K512
.LOPENSSL_armcap:
.word OPENSSL_armcap-sha512_block_data_order
.skip 32-4
.global sha512_block_data_order .global sha512_block_data_order
.type sha512_block_data_order,%function .type sha512_block_data_order,%function
sha512_block_data_order: sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr} stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#640 @ K512 sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8 sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo] ldr $Elo,[$ctx,#$Eoff+$lo]
@ -238,6 +270,7 @@ sha512_block_data_order:
str $Thi,[sp,#$Foff+4] str $Thi,[sp,#$Foff+4]
.L00_15: .L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7] ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6] ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5] ldrb $t1, [$inp,#5]
@ -252,26 +285,30 @@ sha512_block_data_order:
orr $Thi,$Thi,$t3,lsl#8 orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16 orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24 orr $Thi,$Thi,$t1,lsl#24
str $Tlo,[sp,#$Xoff+0] #else
str $Thi,[sp,#$Xoff+4] ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___ ___
&BODY_00_15(0x94); &BODY_00_15(0x94);
$code.=<<___; $code.=<<___;
tst $Ktbl,#1 tst $Ktbl,#1
beq .L00_15 beq .L00_15
bic $Ktbl,$Ktbl,#1
.L16_79:
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] bic $Ktbl,$Ktbl,#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] .L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1 mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1 mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31 eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31 eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8 eor $Tlo,$Tlo,$t0,lsr#8
@ -295,25 +332,24 @@ $code.=<<___;
eor $t1,$t1,$t3,lsl#3 eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6 eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6 eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26 eor $t0,$t0,$t3,lsl#26
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1 adc $Thi,$Thi,$t1
ldr $t0,[sp,#`$Xoff+8*16`+0]
ldr $t1,[sp,#`$Xoff+8*16`+4] ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2 adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1 adc $Thi,$Thi,$t1
str $Tlo,[sp,#$Xoff+0]
str $Thi,[sp,#$Xoff+4]
___ ___
&BODY_00_15(0x17); &BODY_00_15(0x17);
$code.=<<___; $code.=<<___;
tst $Ktbl,#1 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79 beq .L16_79
bic $Ktbl,$Ktbl,#1 bic $Ktbl,$Ktbl,#1
@ -324,12 +360,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Boff+$lo] ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi] ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0 adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Aoff+$lo] str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi] str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo] str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi] str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0] ldr $Alo,[sp,#$Coff+0]
@ -341,12 +377,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Doff+$lo] ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi] ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0 adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Coff+$lo] str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi] str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo] str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi] str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0] ldr $Tlo,[sp,#$Foff+0]
@ -356,12 +392,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Foff+$lo] ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi] ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0 adds $Elo,$Elo,$t0
adc $Ehi,$Ehi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $Elo,[$ctx,#$Eoff+$lo] str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi] str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo] str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi] str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0] ldr $Alo,[sp,#$Goff+0]
@ -373,12 +409,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Hoff+$lo] ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi] ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0 adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Goff+$lo] str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi] str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo] str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi] str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640 add sp,sp,#640
@ -388,13 +424,156 @@ $code.=<<___;
bne .Loop bne .Loop
add sp,sp,#8*9 @ destroy frame add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order #endif
.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" ___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
vadd.i64 $T1,$K,$h
veor $Ch,$f,$g
veor $t0,$t1
vand $Ch,$e
veor $t0,$t2 @ Sigma1(e)
veor $Ch,$g @ Ch(e,f,g)
vadd.i64 $T1,$t0
vshr.u64 $t0,$a,#@Sigma0[0]
vadd.i64 $T1,$Ch
vshr.u64 $t1,$a,#@Sigma0[1]
vshr.u64 $t2,$a,#@Sigma0[2]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vsli.64 $t1,$a,#`64-@Sigma0[1]`
vsli.64 $t2,$a,#`64-@Sigma0[2]`
vadd.i64 $T1,@X[$i%16]
vorr $Maj,$a,$c
vand $Ch,$a,$c
veor $h,$t0,$t1
vand $Maj,$b
veor $h,$t2 @ Sigma0(a)
vorr $Maj,$Ch @ Maj(a,b,c)
vadd.i64 $h,$T1
vadd.i64 $d,$T1
vadd.i64 $h,$Maj
___
}
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.align 4
.LNEON:
dmb @ errata #451034 on early Cortex A8
vstmdb sp!,{d8-d15} @ ABI specification says so
sub $Ktbl,r3,#672 @ K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
bx lr
#endif
___
}
$code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2 .align 2
.comm OPENSSL_armcap,4,4
___ ___
$code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\`([^\`]*)\`/eval $1/gem;
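
The NEON_16_79 rounds above vectorize the SHA-512 message schedule two words at a time while keeping only a rolling 16-word window in q0-q7. The scalar recurrence they implement, with the rotate and shift amounts taken from @sigma0=(1,8,7) and @sigma1=(19,61,6), looks like this in C (sketch only, helper names are illustrative):

    #include <stdint.h>

    #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

    static uint64_t sigma0(uint64_t x) { return ROTR64(x, 1)  ^ ROTR64(x, 8)  ^ (x >> 7); }
    static uint64_t sigma1(uint64_t x) { return ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6); }

    /* W[0..15] hold the 16 big-endian input words; the rest are expanded */
    static void sha512_expand(uint64_t W[80])
    {
        for (int t = 16; t < 80; t++)
            W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
    }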

crypto/sha/sha512.c

@ -61,19 +61,6 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
int SHA384_Init (SHA512_CTX *c) int SHA384_Init (SHA512_CTX *c)
{ {
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
h[2] = 0x629a292a; h[3] = 0x367cd507;
h[4] = 0x9159015a; h[5] = 0x3070dd17;
h[6] = 0x152fecd8; h[7] = 0xf70e5939;
h[8] = 0x67332667; h[9] = 0xffc00b31;
h[10] = 0x8eb44a87; h[11] = 0x68581511;
h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
#else
c->h[0]=U64(0xcbbb9d5dc1059ed8); c->h[0]=U64(0xcbbb9d5dc1059ed8);
c->h[1]=U64(0x629a292a367cd507); c->h[1]=U64(0x629a292a367cd507);
c->h[2]=U64(0x9159015a3070dd17); c->h[2]=U64(0x9159015a3070dd17);
@ -82,7 +69,7 @@ int SHA384_Init (SHA512_CTX *c)
c->h[5]=U64(0x8eb44a8768581511); c->h[5]=U64(0x8eb44a8768581511);
c->h[6]=U64(0xdb0c2e0d64f98fa7); c->h[6]=U64(0xdb0c2e0d64f98fa7);
c->h[7]=U64(0x47b5481dbefa4fa4); c->h[7]=U64(0x47b5481dbefa4fa4);
#endif
c->Nl=0; c->Nh=0; c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA384_DIGEST_LENGTH; c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
return 1; return 1;
@ -90,19 +77,6 @@ int SHA384_Init (SHA512_CTX *c)
int SHA512_Init (SHA512_CTX *c) int SHA512_Init (SHA512_CTX *c)
{ {
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
h[8] = 0x510e527f; h[9] = 0xade682d1;
h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
h[14] = 0x5be0cd19; h[15] = 0x137e2179;
#else
c->h[0]=U64(0x6a09e667f3bcc908); c->h[0]=U64(0x6a09e667f3bcc908);
c->h[1]=U64(0xbb67ae8584caa73b); c->h[1]=U64(0xbb67ae8584caa73b);
c->h[2]=U64(0x3c6ef372fe94f82b); c->h[2]=U64(0x3c6ef372fe94f82b);
@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c)
c->h[5]=U64(0x9b05688c2b3e6c1f); c->h[5]=U64(0x9b05688c2b3e6c1f);
c->h[6]=U64(0x1f83d9abfb41bd6b); c->h[6]=U64(0x1f83d9abfb41bd6b);
c->h[7]=U64(0x5be0cd19137e2179); c->h[7]=U64(0x5be0cd19137e2179);
#endif
c->Nl=0; c->Nh=0; c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA512_DIGEST_LENGTH; c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
return 1; return 1;
@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
if (md==0) return 0; if (md==0) return 0;
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* recall assembler dword order... */
n = c->md_len;
if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
{
unsigned int *h = (unsigned int *)c->h, t;
for (n/=4;n;n--)
{
t = *(h++);
*(md++) = (unsigned char)(t>>24);
*(md++) = (unsigned char)(t>>16);
*(md++) = (unsigned char)(t>>8);
*(md++) = (unsigned char)(t);
}
}
else return 0;
#else
switch (c->md_len) switch (c->md_len)
{ {
/* Let compiler decide if it's appropriate to unroll... */ /* Let compiler decide if it's appropriate to unroll... */
@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
/* ... as well as make sure md_len is not abused. */ /* ... as well as make sure md_len is not abused. */
default: return 0; default: return 0;
} }
#endif
return 1; return 1;
} }