ARM assembler pack update from HEAD.

Andy Polyakov 2011-11-14 20:58:01 +00:00
parent 781bfdc314
commit 88cb59727c
10 changed files with 1068 additions and 219 deletions

crypto/aes/asm/aes-armv4.pl

@ -20,13 +20,18 @@
# May 2007. # May 2007.
# #
# private_AES_set_[en|de]crypt_key is added. # AES_set_[en|de]crypt_key is added.
# July 2010. # July 2010.
# #
# Rescheduling for dual-issue pipeline resulted in 12% improvement on # Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key. # Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
@ -46,6 +51,7 @@ $key="r11";
$rounds="r12"; $rounds="r12";
$code=<<___; $code=<<___;
#include "arm_arch.h"
.text .text
.code 32 .code 32
@ -166,7 +172,7 @@ AES_encrypt:
mov $rounds,r0 @ inp mov $rounds,r0 @ inp
mov $key,r2 mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te sub $tbl,r3,#AES_encrypt-AES_Te @ Te
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner... ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
@ -195,10 +201,33 @@ AES_encrypt:
orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24 orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
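The two code paths above show a pattern that recurs throughout this commit: pre-ARMv7 code gathers the input byte by byte, which is endian-neutral and alignment-safe, while ARMv7 code issues a plain word load (unaligned-capable on v7) followed by rev on little-endian. A minimal C sketch of the same idea for the little-endian case, with an illustrative function name and a GCC-style byte-swap builtin assumed:

#include <stdint.h>
#include <string.h>

/* Not OpenSSL code: models the two load paths in the diff above. */
static uint32_t load_be32(const unsigned char *p, int have_armv7)
{
    if (!have_armv7) {                     /* __ARM_ARCH__<7: ldrb + orr   */
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
    } else {                               /* ARMv7                        */
        uint32_t w;
        memcpy(&w, p, 4);                  /* stands in for ldr            */
        return __builtin_bswap32(w);       /* stands in for rev, __ARMEL__ */
    }
}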
bl _armv4_AES_encrypt bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner... mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8 mov $t3,$s0,lsr#8
@ -227,11 +256,15 @@ AES_encrypt:
strb $t2,[$rounds,#13] strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14] strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15] strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_encrypt,.-AES_encrypt .size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function .type _armv4_AES_encrypt,%function
@ -271,11 +304,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1 and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8 eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2 and $i3,lr,$s2
eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
eor $s1,$s1,$t1,ror#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24 mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16 eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@ -284,16 +317,16 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16 eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2 and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
eor $s2,$s2,$t2,ror#16
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24 mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24 eor $s0,$s0,$i1,ror#24
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16 ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8 eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12] ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8 eor $s3,$s3,$t3,ror#8
@ -333,11 +366,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1 and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8 eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2 and $i3,lr,$s2
eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
eor $s1,$t1,$s1,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24 mov $s2,$s2,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8 eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@ -346,15 +379,15 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8 eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2 and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
eor $s2,$t2,$s2,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24 mov $s3,$s3,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8 eor $s0,$i1,$s0,lsl#8
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0] ldr $i1,[$key,#0]
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8 eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4] ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16 eor $s2,$s2,$i3,lsl#16
@ -371,11 +404,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt .size _armv4_AES_encrypt,.-_armv4_AES_encrypt
.global private_AES_set_encrypt_key .global AES_set_encrypt_key
.type private_AES_set_encrypt_key,%function .type AES_set_encrypt_key,%function
.align 5 .align 5
private_AES_set_encrypt_key: AES_set_encrypt_key:
sub r3,pc,#8 @ private_AES_set_encrypt_key sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0 teq r0,#0
moveq r0,#-1 moveq r0,#-1
beq .Labrt beq .Labrt
@ -392,12 +425,13 @@ private_AES_set_encrypt_key:
bne .Labrt bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr} .Lok: stmdb sp!,{r4-r12,lr}
sub $tbl,r3,#private_AES_set_encrypt_key-AES_Te-1024 @ Te4 sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
mov $rounds,r0 @ inp mov $rounds,r0 @ inp
mov lr,r1 @ bits mov lr,r1 @ bits
mov $key,r2 @ key mov $key,r2 @ key
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner... ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
@ -430,6 +464,22 @@ private_AES_set_encrypt_key:
orr $s3,$s3,$t3,lsl#24 orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8] str $s2,[$key,#-8]
str $s3,[$key,#-4] str $s3,[$key,#-4]
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$key],#16
str $s1,[$key,#-12]
str $s2,[$key,#-8]
str $s3,[$key,#-4]
#endif
teq lr,#128 teq lr,#128
bne .Lnot128 bne .Lnot128
@ -466,6 +516,7 @@ private_AES_set_encrypt_key:
b .Ldone b .Ldone
.Lnot128: .Lnot128:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19] ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18] ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17] ldrb $t2,[$rounds,#17]
@ -482,6 +533,16 @@ private_AES_set_encrypt_key:
str $i2,[$key],#8 str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24 orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4] str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#16]
ldr $i3,[$rounds,#20]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
teq lr,#192 teq lr,#192
bne .Lnot192 bne .Lnot192
@ -526,6 +587,7 @@ private_AES_set_encrypt_key:
b .L192_loop b .L192_loop
.Lnot192: .Lnot192:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27] ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26] ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25] ldrb $t2,[$rounds,#25]
@ -542,6 +604,16 @@ private_AES_set_encrypt_key:
str $i2,[$key],#8 str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24 orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4] str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#24]
ldr $i3,[$rounds,#28]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
mov $rounds,#14 mov $rounds,#14
str $rounds,[$key,#240-32] str $rounds,[$key,#240-32]
@ -606,21 +678,21 @@ private_AES_set_encrypt_key:
.Labrt: tst lr,#1 .Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .size AES_set_encrypt_key,.-AES_set_encrypt_key
.global private_AES_set_decrypt_key .global AES_set_decrypt_key
.type private_AES_set_decrypt_key,%function .type AES_set_decrypt_key,%function
.align 5 .align 5
private_AES_set_decrypt_key: AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr str lr,[sp,#-4]! @ push lr
bl private_AES_set_encrypt_key bl AES_set_encrypt_key
teq r0,#0 teq r0,#0
ldrne lr,[sp],#4 @ pop lr ldrne lr,[sp],#4 @ pop lr
bne .Labrt bne .Labrt
stmdb sp!,{r4-r12} stmdb sp!,{r4-r12}
ldr $rounds,[r2,#240] @ private_AES_set_encrypt_key preserves r2, ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
mov $key,r2 @ which is AES_KEY *key mov $key,r2 @ which is AES_KEY *key
mov $i1,r2 mov $i1,r2
add $i2,r2,$rounds,lsl#4 add $i2,r2,$rounds,lsl#4
@ -692,11 +764,15 @@ $code.=<<___;
bne .Lmix bne .Lmix
mov r0,#0 mov r0,#0
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key #endif
.size AES_set_decrypt_key,.-AES_set_decrypt_key
.type AES_Td,%object .type AES_Td,%object
.align 5 .align 5
@ -811,7 +887,7 @@ AES_decrypt:
mov $rounds,r0 @ inp mov $rounds,r0 @ inp
mov $key,r2 mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td sub $tbl,r3,#AES_decrypt-AES_Td @ Td
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner... ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
@ -840,10 +916,33 @@ AES_decrypt:
orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24 orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
bl _armv4_AES_decrypt bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner... mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8 mov $t3,$s0,lsr#8
@ -872,11 +971,15 @@ AES_decrypt:
strb $t2,[$rounds,#13] strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14] strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15] strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_decrypt,.-AES_decrypt .size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function .type _armv4_AES_decrypt,%function
@ -916,11 +1019,11 @@ _armv4_AES_decrypt:
and $i2,lr,$s2 @ i1 and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8 eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16 and $i3,lr,$s2,lsr#16
eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
eor $s1,$s1,$t1,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24 mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16 eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@ -929,22 +1032,22 @@ _armv4_AES_decrypt:
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8 eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2 and $i3,lr,$s3 @ i2
eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
eor $s2,$s2,$t2,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24 mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8 eor $s0,$s0,$i1,ror#8
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s1,$s1,$i2,ror#16
eor $s2,$s2,$i3,ror#24
ldr $i1,[$key],#16 ldr $i1,[$key],#16
eor $s3,$s3,$t3,ror#8 eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
ldr $t1,[$key,#-12] ldr $t1,[$key,#-12]
ldr $t2,[$key,#-8]
eor $s0,$s0,$i1 eor $s0,$s0,$i1
ldr $t2,[$key,#-8]
eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4] ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16 and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1 eor $s1,$s1,$t1
@ -985,11 +1088,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s2,lsr#8 @ i0 and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8 eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1 and $i2,lr,$s2 @ i1
eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
eor $t3,$t3,$i3,lsl#8
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16 and $i3,lr,$s2,lsr#16
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8 eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@ -997,11 +1100,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s3,lsr#16 @ i0 and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16 eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
eor $t3,$t3,$i3,lsl#16
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2 and $i3,lr,$s3 @ i2
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16 eor $s0,$s0,$i1,lsl#16

crypto/arm_arch.h (new file, 51 lines)

@ -0,0 +1,51 @@
#ifndef __ARM_ARCH_H__
#define __ARM_ARCH_H__
#if !defined(__ARM_ARCH__)
# if defined(__CC_ARM)
# define __ARM_ARCH__ __TARGET_ARCH_ARM
# if defined(__BIG_ENDIAN)
# define __ARMEB__
# else
# define __ARMEL__
# endif
# elif defined(__GNUC__)
/*
* Why doesn't gcc define __ARM_ARCH__? Instead it defines a
* bunch of the macros below; see the all_architectures[] table in
* gcc/config/arm/arm.c. As a side note, it defines
* __ARMEL__/__ARMEB__ for little-/big-endian.
*/
# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
defined(__ARM_ARCH_7EM__)
# define __ARM_ARCH__ 7
# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
defined(__ARM_ARCH_6T2__)
# define __ARM_ARCH__ 6
# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
defined(__ARM_ARCH_5TEJ__)
# define __ARM_ARCH__ 5
# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
# define __ARM_ARCH__ 4
# else
# error "unsupported ARM architecture"
# endif
# endif
#endif
#ifdef OPENSSL_FIPSCANISTER
#include <openssl/fipssyms.h>
#endif
#if !__ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
#define ARMV7_NEON (1<<0)
#define ARMV7_TICK (1<<1)
#endif
#endif
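OPENSSL_armcap_P is populated at startup by OPENSSL_cpuid_setup() in crypto/armcap.c (added below); code then tests the ARMV7_* bits to pick an implementation. The assembler modules in this commit perform that test themselves (the tst r12,#1 sequences in armv4-gf2m.pl and sha512-armv4.pl), but a C caller would dispatch roughly like this hypothetical sketch (the two sha512_block_* names are made up for illustration):

#include <stddef.h>
#include "arm_arch.h"

extern void sha512_block_neon(void *ctx, const void *inp, size_t num);
extern void sha512_block_ialu(void *ctx, const void *inp, size_t num);

static void sha512_block(void *ctx, const void *inp, size_t num)
{
    if (OPENSSL_armcap_P & ARMV7_NEON)   /* set by OPENSSL_cpuid_setup() */
        sha512_block_neon(ctx, inp, num);
    else
        sha512_block_ialu(ctx, inp, num);
}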

crypto/armcap.c (new file, 80 lines)

@ -0,0 +1,80 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>
#include <signal.h>
#include <crypto.h>
#include "arm_arch.h"
unsigned int OPENSSL_armcap_P;
static sigset_t all_masked;
static sigjmp_buf ill_jmp;
static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
/*
* The following subroutines could have been inlined, but not all
* ARM compilers support inline assembler...
*/
void _armv7_neon_probe(void);
unsigned int _armv7_tick(void);
unsigned int OPENSSL_rdtsc(void)
{
if (OPENSSL_armcap_P & ARMV7_TICK)
return _armv7_tick();
else
return 0;
}
#if defined(__GNUC__) && __GNUC__>=2
void OPENSSL_cpuid_setup(void) __attribute__((constructor));
#endif
void OPENSSL_cpuid_setup(void)
{
char *e;
struct sigaction ill_oact,ill_act;
sigset_t oset;
static int trigger=0;
if (trigger) return;
trigger=1;
if ((e=getenv("OPENSSL_armcap")))
{
OPENSSL_armcap_P=strtoul(e,NULL,0);
return;
}
sigfillset(&all_masked);
sigdelset(&all_masked,SIGILL);
sigdelset(&all_masked,SIGTRAP);
sigdelset(&all_masked,SIGFPE);
sigdelset(&all_masked,SIGBUS);
sigdelset(&all_masked,SIGSEGV);
OPENSSL_armcap_P = 0;
memset(&ill_act,0,sizeof(ill_act));
ill_act.sa_handler = ill_handler;
ill_act.sa_mask = all_masked;
sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
sigaction(SIGILL,&ill_act,&ill_oact);
if (sigsetjmp(ill_jmp,1) == 0)
{
_armv7_neon_probe();
OPENSSL_armcap_P |= ARMV7_NEON;
}
if (sigsetjmp(ill_jmp,1) == 0)
{
_armv7_tick();
OPENSSL_armcap_P |= ARMV7_TICK;
}
sigaction (SIGILL,&ill_oact,NULL);
sigprocmask(SIG_SETMASK,&oset,NULL);
}

crypto/armv4cpuid.S (new file, 154 lines)

@ -0,0 +1,154 @@
#include "arm_arch.h"
.text
.code 32
.align 5
.global _armv7_neon_probe
.type _armv7_neon_probe,%function
_armv7_neon_probe:
.word 0xf26ee1fe @ vorr q15,q15,q15
.word 0xe12fff1e @ bx lr
.size _armv7_neon_probe,.-_armv7_neon_probe
.global _armv7_tick
.type _armv7_tick,%function
_armv7_tick:
mrc p15,0,r0,c9,c13,0
.word 0xe12fff1e @ bx lr
.size _armv7_tick,.-_armv7_tick
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,%function
OPENSSL_atomic_add:
#if __ARM_ARCH__>=6
.Ladd: ldrex r2,[r0]
add r3,r2,r1
strex r2,r3,[r0]
cmp r2,#0
bne .Ladd
mov r0,r3
.word 0xe12fff1e @ bx lr
#else
stmdb sp!,{r4-r6,lr}
ldr r2,.Lspinlock
adr r3,.Lspinlock
mov r4,r0
mov r5,r1
add r6,r3,r2 @ &spinlock
b .+8
.Lspin: bl sched_yield
mov r0,#-1
swp r0,r0,[r6]
cmp r0,#0
bne .Lspin
ldr r2,[r4]
add r2,r2,r5
str r2,[r4]
str r0,[r6] @ release spinlock
ldmia sp!,{r4-r6,lr}
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
#endif
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
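On ARMv6 and later, OPENSSL_atomic_add above is an ldrex/strex retry loop; on older cores it serializes through a swp-based spinlock (yielding via sched_yield while contended). Either way the intended semantics are an atomic add-and-fetch, roughly this C model (illustrative only; uses a GCC __sync builtin):

int atomic_add_model(int *addr, int amount)
{
    int old, sum;
    do {
        old = *(volatile int *)addr;                  /* ldrex r2,[r0]  */
        sum = old + amount;                           /* add   r3,r2,r1 */
    } while (!__sync_bool_compare_and_swap(addr, old, sum)); /* strex   */
    return sum;                                       /* mov r0,r3      */
}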
.global OPENSSL_cleanse
.type OPENSSL_cleanse,%function
OPENSSL_cleanse:
eor ip,ip,ip
cmp r1,#7
subhs r1,r1,#4
bhs .Lot
cmp r1,#0
beq .Lcleanse_done
.Little:
strb ip,[r0],#1
subs r1,r1,#1
bhi .Little
b .Lcleanse_done
.Lot: tst r0,#3
beq .Laligned
strb ip,[r0],#1
sub r1,r1,#1
b .Lot
.Laligned:
str ip,[r0],#4
subs r1,r1,#4
bhs .Laligned
adds r1,r1,#4
bne .Little
.Lcleanse_done:
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_cleanse,.-OPENSSL_cleanse
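OPENSSL_cleanse zeroes len bytes of memory: buffers of 7 bytes or more are zeroed byte-wise up to a 4-byte boundary, then word-wise, then the tail; smaller buffers are handled byte-wise throughout. A semantic C model, not a line-by-line transcription of the assembly:

#include <stddef.h>
#include <stdint.h>

static void cleanse_model(void *ptr, size_t len)
{
    unsigned char *p = ptr;

    if (len >= 7) {
        while (len && ((uintptr_t)p & 3)) { *p++ = 0; len--; }     /* .Lot      */
        while (len >= 4) { *(uint32_t *)p = 0; p += 4; len -= 4; } /* .Laligned */
    }
    while (len) { *p++ = 0; len--; }                               /* .Little   */
}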
.global OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,%function
OPENSSL_wipe_cpu:
ldr r0,.LOPENSSL_armcap
adr r1,.LOPENSSL_armcap
ldr r0,[r1,r0]
eor r2,r2,r2
eor r3,r3,r3
eor ip,ip,ip
tst r0,#1
beq .Lwipe_done
.word 0xf3000150 @ veor q0, q0, q0
.word 0xf3022152 @ veor q1, q1, q1
.word 0xf3044154 @ veor q2, q2, q2
.word 0xf3066156 @ veor q3, q3, q3
.word 0xf34001f0 @ veor q8, q8, q8
.word 0xf34221f2 @ veor q9, q9, q9
.word 0xf34441f4 @ veor q10, q10, q10
.word 0xf34661f6 @ veor q11, q11, q11
.word 0xf34881f8 @ veor q12, q12, q12
.word 0xf34aa1fa @ veor q13, q13, q13
.word 0xf34cc1fc @ veor q14, q14, q14
.word 0xf34ee1fe @ veor q15, q15, q15
.Lwipe_done:
mov r0,sp
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.global OPENSSL_instrument_bus
.type OPENSSL_instrument_bus,%function
OPENSSL_instrument_bus:
eor r0,r0,r0
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
.global OPENSSL_instrument_bus2
.type OPENSSL_instrument_bus2,%function
OPENSSL_instrument_bus2:
eor r0,r0,r0
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.LOPENSSL_armcap
#if __ARM_ARCH__>=6
.align 5
#else
.Lspinlock:
.word atomic_add_spinlock-.Lspinlock
.align 5
.data
.align 2
atomic_add_spinlock:
.word 0
#endif
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P

crypto/bn/asm/armv4-gf2m.pl (new file, 278 lines)

@ -0,0 +1,278 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's a fairly mechanical, low-hanging-fruit port
# from C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU, and NEON code
# suitable for ARMv7. The pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on a dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means a 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even fewer cycles, ~30, the improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON to shine...
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_ARCH__>=7
.fpu neon
.type mul_1x1_neon,%function
.align 5
mul_1x1_neon:
vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
vshl.u64 `&Dlo("q2")`,d16,#16
vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
vshl.u64 `&Dlo("q3")`,d16,#24
vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
vshr.u64 `&Dlo("q1")`,#8
vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
vshl.u64 `&Dhi("q1")`,#24
veor d0,`&Dlo("q1")`
vshr.u64 `&Dlo("q2")`,#16
veor d0,`&Dhi("q1")`
vshl.u64 `&Dhi("q2")`,#16
veor d0,`&Dlo("q2")`
vshr.u64 `&Dlo("q3")`,#24
veor d0,`&Dhi("q2")`
vshl.u64 `&Dhi("q3")`,#8
veor d0,`&Dlo("q3")`
veor d0,`&Dhi("q3")`
bx lr
.size mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
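mul_1x1_ialu is a carry-less (GF(2)) 32x32->64-bit multiplication: it tabulates the eight XOR combinations of a1, a1<<1 and a1<<2 (with the top two bits of $a masked off so the table entries fit comfortably), consumes $b three bits at a time, and folds bits 30 and 31 of $a back in at the end. A C model of that algorithm (illustrative, not part of the module):

#include <stdint.h>

static uint64_t mul_1x1_model(uint32_t a, uint32_t b)
{
    uint64_t tab[8], r;
    uint64_t a1 = a & 0x3fffffff, a2 = a1 << 1, a4 = a1 << 2;
    int s;

    tab[0] = 0;  tab[1] = a1;      tab[2] = a2;      tab[3] = a1 ^ a2;
    tab[4] = a4; tab[5] = a1 ^ a4; tab[6] = a2 ^ a4; tab[7] = a1 ^ a2 ^ a4;

    r = tab[b & 7];                              /* tab[b & 0x7]           */
    for (s = 3; s < 32; s += 3)
        r ^= tab[(b >> s) & 7] << s;             /* tab[b >> s & 0x7] << s */

    if (a & (1u << 30)) r ^= (uint64_t)b << 30;  /* tst $a,#1<<30          */
    if (a & (1u << 31)) r ^= (uint64_t)b << 31;  /* tst $a,#1<<31          */

    return r;                                    /* $hi:$lo                */
}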
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
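The 2x2 product is then assembled from three 1x1 multiplications in Karatsuba fashion: a1·b1, a0·b0 and (a0^a1)·(b0^b1), whose XOR gives the middle term. In C, assuming 32-bit BN_ULONG as on ARM and reusing the mul_1x1_model sketch above:

#include <stdint.h>

static void gf2m_mul_2x2_model(uint32_t r[4],
                               uint32_t a1, uint32_t a0,
                               uint32_t b1, uint32_t b0)
{
    uint64_t hh = mul_1x1_model(a1, b1);           /* a1·b1           */
    uint64_t ll = mul_1x1_model(a0, b0);           /* a0·b0           */
    uint64_t mm = mul_1x1_model(a0 ^ a1, b0 ^ b1); /* (a0+a1)·(b0+b1) */

    mm ^= hh ^ ll;                                 /* a1·b0 + a0·b1   */

    r[0] = (uint32_t)ll;
    r[1] = (uint32_t)(ll >> 32) ^ (uint32_t)mm;
    r[2] = (uint32_t)hh ^ (uint32_t)(mm >> 32);
    r[3] = (uint32_t)(hh >> 32);
}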
($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
beq .Lialu
veor $A1,$A1
vmov.32 $B1,r3,r3 @ two copies of b1
vmov.32 ${A1}[0],r1 @ a1
veor $A0,$A0
vld1.32 ${B0}[],[sp,:32] @ two copies of b0
vmov.32 ${A0}[0],r2 @ a0
mov r12,lr
vmov d16,$A1
vmov d17,$B1
bl mul_1x1_neon @ a1·b1
vmov $A1B1,d0
vmov d16,$A0
vmov d17,$B0
bl mul_1x1_neon @ a0·b0
vmov $A0B0,d0
veor d16,$A0,$A1
veor d17,$B0,$B1
veor $A0,$A0B0,$A1B1
bl mul_1x1_neon @ (a0+a1)·(b0+b1)
veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
vshl.u64 d1,d0,#32
vshr.u64 d0,d0,#32
veor $A0B0,d1
veor $A1B1,d0
vst1.32 {${A0B0}[0]},[r0,:32]!
vst1.32 {${A0B0}[1]},[r0,:32]!
vst1.32 {${A1B1}[0]},[r0,:32]!
vst1.32 {${A1B1}[1]},[r0,:32]
bx r12
.align 4
.Lialu:
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush

crypto/bn/asm/armv4-mont.pl

@ -23,6 +23,9 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively # than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical. # about decorations, ABI and instruction syntax are identical.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$num="r0"; # starts as num argument, but holds &tp[num-1] $num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1"; $ap="r1";
$bp="r2"; $bi="r2"; $rp="r2"; $bp="r2"; $bi="r2"; $rp="r2";
@ -89,9 +92,9 @@ bn_mul_mont:
.L1st: .L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++ ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0 mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0 mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo adds $nlo,$nlo,$alo
@ -101,21 +104,21 @@ bn_mul_mont:
bne .L1st bne .L1st
adds $nlo,$nlo,$ahi adds $nlo,$nlo,$ahi
mov $nhi,#0
adc $nhi,$nhi,#0
ldr $tp,[$_bp] @ restore bp ldr $tp,[$_bp] @ restore bp
str $nlo,[$num] @ tp[num-1]= mov $nhi,#0
ldr $n0,[$_n0] @ restore n0 ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]= str $nhi,[$num,#4] @ tp[num]=
.Louter: .Louter:
sub $tj,$num,sp @ "original" $num-1 value sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1] sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $bi,[$tp,#4]! @ *(++bp) ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0] ldr $aj,[$ap,#-4] @ ap[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $alo,[sp] @ tp[0] ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1] ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0 mov $ahi,#0
@ -129,13 +132,13 @@ bn_mul_mont:
.Linner: .Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++ ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j] adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0 mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0 mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
ldr $tj,[$tp,#8] @ tp[j+1]
adc $ahi,$ahi,#0 adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0 adc $nlo,$nhi,#0
@ -144,13 +147,13 @@ bn_mul_mont:
adds $nlo,$nlo,$ahi adds $nlo,$nlo,$ahi
mov $nhi,#0 mov $nhi,#0
adc $nhi,$nhi,#0
adds $nlo,$nlo,$tj
adc $nhi,$nhi,#0
ldr $tp,[$_bp] @ restore bp ldr $tp,[$_bp] @ restore bp
ldr $tj,[$_bpend] @ restore &bp[num] adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
ldr $n0,[$_n0] @ restore n0 ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]= str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj cmp $tp,$tj

crypto/sha/asm/sha1-armv4-large.pl

@ -47,6 +47,10 @@
# Cortex A8 core and in absolute terms ~870 cycles per input block # Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte]. # [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
@ -76,31 +80,41 @@ $code.=<<___;
add $e,$K,$e,ror#2 @ E+=K_xx_xx add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4] ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1 eor $t0,$t0,$t1
eor $t2,$t2,$t3 eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31 mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27) add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31 eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx $opt1 @ F_xx_xx
$opt2 @ F_xx_xx $opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i] add $e,$e,$t0 @ E+=X[i]
str $t0,[$Xi,#-4]!
___ ___
} }
sub BODY_00_15 { sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_; my ($a,$b,$c,$d,$e)=@_;
$code.=<<___; $code.=<<___;
ldrb $t0,[$inp],#4 #if __ARM_ARCH__<7
ldrb $t1,[$inp,#-1] ldrb $t1,[$inp,#2]
ldrb $t2,[$inp,#-2] ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19 add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp,#-3] ldrb $t3,[$inp],#4
add $e,$e,$a,ror#27 @ E+=ROR(A,27) orr $t0,$t0,$t1,lsl#8
orr $t0,$t1,$t0,lsl#24
eor $t1,$c,$d @ F_xx_xx eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#8 orr $t0,$t0,$t2,lsl#16
orr $t0,$t0,$t3,lsl#16 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2 and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i] add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
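BODY_00_15 above fuses the message load with one SHA-1 round; in textbook form (the assembler keeps b through e pre-rotated, hence the ror#2 operands) the round for i = 0..19 is, as a C sketch:

#include <stdint.h>

#define ROL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

static void sha1_round_00_19(uint32_t *a, uint32_t *b, uint32_t *c,
                             uint32_t *d, uint32_t *e, uint32_t Xi)
{
    uint32_t f = *d ^ (*b & (*c ^ *d));          /* F_00_19(B,C,D) */

    *e += ROL32(*a, 5) + f + 0x5a827999 + Xi;    /* K_00_19 + X[i] */
    *b  = ROL32(*b, 30);
}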
@ -136,6 +150,8 @@ ___
} }
$code=<<___; $code=<<___;
#include "arm_arch.h"
.text .text
.global sha1_block_data_order .global sha1_block_data_order
@ -209,10 +225,14 @@ $code.=<<___;
teq $inp,$len teq $inp,$len
bne .Lloop @ [+18], total 1307 bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.align 2 .align 2
.LK_00_19: .word 0x5a827999 .LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1 .LK_20_39: .word 0x6ed9eba1

crypto/sha/asm/sha256-armv4.pl

@ -18,11 +18,16 @@
# Rescheduling for dual-issue pipeline resulted in 22% improvement on # Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte. # Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
$ctx="r0"; $t0="r0"; $ctx="r0"; $t0="r0";
$inp="r1"; $inp="r1"; $t3="r1";
$len="r2"; $t1="r2"; $len="r2"; $t1="r2";
$T1="r3"; $T1="r3";
$A="r4"; $A="r4";
@ -46,6 +51,9 @@ sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16); $code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
ldr $T1,[$inp],#4
#else
ldrb $T1,[$inp,#3] @ $i ldrb $T1,[$inp,#3] @ $i
ldrb $t2,[$inp,#2] ldrb $t2,[$inp,#2]
ldrb $t1,[$inp,#1] ldrb $t1,[$inp,#1]
@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
orr $T1,$T1,$t2,lsl#8 orr $T1,$T1,$t2,lsl#8
orr $T1,$T1,$t1,lsl#16 orr $T1,$T1,$t1,lsl#16
orr $T1,$T1,$t0,lsl#24 orr $T1,$T1,$t0,lsl#24
`"str $inp,[sp,#17*4]" if ($i==15)` #endif
___ ___
$code.=<<___; $code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
mov $t0,$e,ror#$Sigma1[0] mov $t0,$e,ror#$Sigma1[0]
str $T1,[sp,#`$i%16`*4] ldr $t2,[$Ktbl],#4 @ *K256++
eor $t0,$t0,$e,ror#$Sigma1[1] eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t1,$f,$g eor $t1,$f,$g
#if $i>=16
add $T1,$T1,$t3 @ from BODY_16_xx
#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $T1,$T1
#endif
#if $i==15
str $inp,[sp,#17*4] @ leave room for $t3
#endif
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e and $t1,$t1,$e
str $T1,[sp,#`$i%16`*4]
add $T1,$T1,$t0 add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g) eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$h add $T1,$T1,$h
@ -71,6 +87,9 @@ $code.=<<___;
eor $h,$h,$a,ror#$Sigma0[1] eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2 add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
#if $i>=15
ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
#endif
orr $t0,$a,$b orr $t0,$a,$b
and $t1,$a,$b and $t1,$a,$b
and $t0,$t0,$c and $t0,$t0,$c
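For reference, the quantities named in BODY_00_15 (Sigma1, Ch, Sigma0, Maj) combine into one SHA-256 round as below; this is the textbook form with the standard rotation counts, not the scheduled assembler (a C sketch):

#include <stdint.h>

#define ROR32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

static void sha256_round(uint32_t v[8], uint32_t Ki, uint32_t Xi)
{
    uint32_t a=v[0], b=v[1], c=v[2], d=v[3], e=v[4], f=v[5], g=v[6], h=v[7];
    uint32_t S1 = ROR32(e,6) ^ ROR32(e,11) ^ ROR32(e,25);   /* Sigma1(e)  */
    uint32_t Ch = (e & f) ^ (~e & g);                       /* Ch(e,f,g)  */
    uint32_t T1 = h + S1 + Ch + Ki + Xi;
    uint32_t S0 = ROR32(a,2) ^ ROR32(a,13) ^ ROR32(a,22);   /* Sigma0(a)  */
    uint32_t Mj = (a & b) ^ (a & c) ^ (b & c);              /* Maj(a,b,c) */

    v[7]=g; v[6]=f; v[5]=e; v[4]=d+T1;
    v[3]=c; v[2]=b; v[1]=a; v[0]=T1+S0+Mj;
}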
@ -85,24 +104,26 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___; $code.=<<___;
ldr $t1,[sp,#`($i+1)%16`*4] @ $i @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4] ldr $t2,[sp,#`($i+14)%16`*4]
mov $t0,$t3,ror#$sigma0[0]
ldr $T1,[sp,#`($i+0)%16`*4] ldr $T1,[sp,#`($i+0)%16`*4]
mov $t0,$t1,ror#$sigma0[0] eor $t0,$t0,$t3,ror#$sigma0[1]
ldr $inp,[sp,#`($i+9)%16`*4] ldr $t1,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1] eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) mov $t3,$t2,ror#$sigma1[0]
mov $t1,$t2,ror#$sigma1[0]
add $T1,$T1,$t0 add $T1,$T1,$t0
eor $t1,$t1,$t2,ror#$sigma1[1] eor $t3,$t3,$t2,ror#$sigma1[1]
add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
add $T1,$T1,$t1 add $T1,$T1,$t1
eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
@ add $T1,$T1,$t3
___ ___
&BODY_00_15(@_); &BODY_00_15(@_);
} }
$code=<<___; $code=<<___;
#include "arm_arch.h"
.text .text
.code 32 .code 32
@ -132,7 +153,7 @@ K256:
sha256_block_data_order: sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256 @ K256 sub $Ktbl,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16]) sub sp,sp,#16*4 @ alloca(X[16])
@ -171,10 +192,14 @@ $code.=<<___;
bne .Loop bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame add sp,sp,#`16+3`*4 @ destroy frame
ldmia sp!,{r4-r12,lr} #if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order .size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2 .align 2

crypto/sha/asm/sha512-armv4.pl

@ -18,22 +18,33 @@
# Rescheduling for dual-issue pipeline resulted in 6% improvement on # Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte. # Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 25.5 cycles or 47% faster than integer-only code.
# Byte order [in]dependence. ========================================= # Byte order [in]dependence. =========================================
# #
# Caller is expected to maintain specific *dword* order in h[0-7], # Originally caller was expected to maintain specific *dword* order in
# namely with most significant dword at *lower* address, which is # h[0-7], namely with most significant dword at *lower* address, which
# reflected in below two parameters. *Byte* order within these dwords # was reflected in below two parameters as 0 and 4. Now caller is
# in turn is whatever *native* byte order on current platform. # expected to maintain native byte order for whole 64-bit values.
$hi=0; $hi="HI";
$lo=4; $lo="LO";
# ==================================================================== # ====================================================================
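Concretely, the old convention split each 64-bit h[i] into two host-order 32-bit halves with the most significant half at the lower address (which is why sha512.c, further down in this commit, carried ARM-specific init and final code), whereas the new convention is a plain native 64-bit value. Illustrative C, using the SHA-512 initial value that appears in the sha512.c hunk below:

#include <stdint.h>

static void h0_old_arm_layout(uint32_t h[16])
{
    h[0] = 0x6a09e667;                   /* most significant dword first   */
    h[1] = 0xf3bcc908;                   /* then least significant dword   */
}

static void h0_native_layout(uint64_t h[8])
{
    h[0] = 0x6a09e667f3bcc908ULL;        /* whole value, native byte order */
}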
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output"; open STDOUT,">$output";
$ctx="r0"; $ctx="r0"; # parameter block
$inp="r1"; $inp="r1";
$len="r2"; $len="r2";
$Tlo="r3"; $Tlo="r3";
$Thi="r4"; $Thi="r4";
$Alo="r5"; $Alo="r5";
@ -61,15 +72,17 @@ $Xoff=8*8;
sub BODY_00_15() { sub BODY_00_15() {
my $magic = shift; my $magic = shift;
$code.=<<___; $code.=<<___;
ldr $t2,[sp,#$Hoff+0] @ h.lo
ldr $t3,[sp,#$Hoff+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14 mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14 mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18 eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18 eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18 eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18 eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14 eor $t0,$t0,$Ehi,lsl#14
@ -96,25 +109,24 @@ $code.=<<___;
and $t1,$t1,$Ehi and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4] str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2 eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#4] @ K[i].lo ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g) eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2 adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i] adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo adds $Elo,$Elo,$Tlo
adc $Ehi,$Ehi,$Thi @ d += T
and $t0,$t2,#0xff
teq $t0,#$magic
orreq $Ktbl,$Ktbl,#1
ldr $t2,[sp,#$Boff+0] @ b.lo ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo ldr $t3,[sp,#$Coff+0] @ c.lo
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@ -131,80 +143,100 @@ $code.=<<___;
eor $t0,$t0,$Alo,lsl#25 eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a) adc $Thi,$Thi,$t1 @ T += Sigma0(a)
and $t0,$Alo,$t2
orr $Alo,$Alo,$t2
ldr $t1,[sp,#$Boff+4] @ b.hi ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3 and $Alo,$Alo,$t3
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $t3,$Ahi,$t1 and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1 orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2 and $Ahi,$Ahi,$t2
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
adds $Alo,$Alo,$Tlo adds $Alo,$Alo,$Tlo
adc $Ahi,$Ahi,$Thi @ h += T orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8 sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8 add $Ktbl,$Ktbl,#8
___ ___
} }
$code=<<___; $code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text .text
.code 32 .code 32
.type K512,%object .type K512,%object
.align 5 .align 5
K512: K512:
.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512 .size K512,.-K512
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
.global sha512_block_data_order .global sha512_block_data_order
.type sha512_block_data_order,%function .type sha512_block_data_order,%function
sha512_block_data_order: sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr} stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#640 @ K512 sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8 sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo] ldr $Elo,[$ctx,#$Eoff+$lo]
@ -238,6 +270,7 @@ sha512_block_data_order:
str $Thi,[sp,#$Foff+4] str $Thi,[sp,#$Foff+4]
.L00_15: .L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7] ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6] ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5] ldrb $t1, [$inp,#5]
@ -252,26 +285,30 @@ sha512_block_data_order:
orr $Thi,$Thi,$t3,lsl#8 orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16 orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24 orr $Thi,$Thi,$t1,lsl#24
str $Tlo,[sp,#$Xoff+0] #else
str $Thi,[sp,#$Xoff+4] ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___ ___
&BODY_00_15(0x94); &BODY_00_15(0x94);
$code.=<<___; $code.=<<___;
tst $Ktbl,#1 tst $Ktbl,#1
beq .L00_15 beq .L00_15
bic $Ktbl,$Ktbl,#1
.L16_79:
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] bic $Ktbl,$Ktbl,#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] .L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1 mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1 mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31 eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31 eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8 eor $Tlo,$Tlo,$t0,lsr#8
@ -295,25 +332,24 @@ $code.=<<___;
eor $t1,$t1,$t3,lsl#3 eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6 eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6 eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26 eor $t0,$t0,$t3,lsl#26
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1 adc $Thi,$Thi,$t1
ldr $t0,[sp,#`$Xoff+8*16`+0]
ldr $t1,[sp,#`$Xoff+8*16`+4] ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2 adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1 adc $Thi,$Thi,$t1
str $Tlo,[sp,#$Xoff+0]
str $Thi,[sp,#$Xoff+4]
___ ___
&BODY_00_15(0x17); &BODY_00_15(0x17);
$code.=<<___; $code.=<<___;
tst $Ktbl,#1 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79 beq .L16_79
bic $Ktbl,$Ktbl,#1 bic $Ktbl,$Ktbl,#1
@ -324,12 +360,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Boff+$lo] ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi] ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0 adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Aoff+$lo] str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi] str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo] str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi] str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0] ldr $Alo,[sp,#$Coff+0]
@ -341,12 +377,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Doff+$lo] ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi] ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0 adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Coff+$lo] str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi] str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo] str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi] str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0] ldr $Tlo,[sp,#$Foff+0]
@ -356,12 +392,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Foff+$lo] ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi] ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0 adds $Elo,$Elo,$t0
adc $Ehi,$Ehi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $Elo,[$ctx,#$Eoff+$lo] str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi] str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo] str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi] str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0] ldr $Alo,[sp,#$Goff+0]
@ -373,12 +409,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Hoff+$lo] ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi] ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0 adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Goff+$lo] str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi] str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo] str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi] str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640 add sp,sp,#640
@ -388,13 +424,156 @@ $code.=<<___;
bne .Loop bne .Loop
add sp,sp,#8*9 @ destroy frame add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr} ldmia sp!,{r4-r12,lr}
tst lr,#1 tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-) bx lr @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order #endif
.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" ___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
vadd.i64 $T1,$K,$h
veor $Ch,$f,$g
veor $t0,$t1
vand $Ch,$e
veor $t0,$t2 @ Sigma1(e)
veor $Ch,$g @ Ch(e,f,g)
vadd.i64 $T1,$t0
vshr.u64 $t0,$a,#@Sigma0[0]
vadd.i64 $T1,$Ch
vshr.u64 $t1,$a,#@Sigma0[1]
vshr.u64 $t2,$a,#@Sigma0[2]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vsli.64 $t1,$a,#`64-@Sigma0[1]`
vsli.64 $t2,$a,#`64-@Sigma0[2]`
vadd.i64 $T1,@X[$i%16]
vorr $Maj,$a,$c
vand $Ch,$a,$c
veor $h,$t0,$t1
vand $Maj,$b
veor $h,$t2 @ Sigma0(a)
vorr $Maj,$Ch @ Maj(a,b,c)
vadd.i64 $h,$T1
vadd.i64 $d,$T1
vadd.i64 $h,$Maj
___
}
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
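NEON_16_79 evaluates the SHA-512 message schedule for two rounds at a time by viewing pairs of X[] doublewords as one 128-bit q register; per round, the scalar recurrence it vectorizes is the following (C sketch with the standard sigma rotation counts, matching @sigma0=(1,8,7) and @sigma1=(19,61,6)):

#include <stdint.h>

#define ROR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))

static uint64_t sha512_schedule(const uint64_t X[16], int i)
{
    uint64_t x1  = X[(i + 1) % 16], x14 = X[(i + 14) % 16];
    uint64_t s0  = ROR64(x1, 1)  ^ ROR64(x1, 8)  ^ (x1  >> 7);   /* sigma0 */
    uint64_t s1  = ROR64(x14,19) ^ ROR64(x14,61) ^ (x14 >> 6);   /* sigma1 */

    return X[i % 16] + s0 + s1 + X[(i + 9) % 16];
}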
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.align 4
.LNEON:
dmb @ errata #451034 on early Cortex A8
vstmdb sp!,{d8-d15} @ ABI specification says so
sub $Ktbl,r3,#672 @ K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
bx lr
#endif
___
}
$code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2 .align 2
.comm OPENSSL_armcap_P,4,4
___ ___
$code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\`([^\`]*)\`/eval $1/gem;

crypto/sha/sha512.c

@ -61,19 +61,6 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
fips_md_init_ctx(SHA384, SHA512) fips_md_init_ctx(SHA384, SHA512)
{ {
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
h[2] = 0x629a292a; h[3] = 0x367cd507;
h[4] = 0x9159015a; h[5] = 0x3070dd17;
h[6] = 0x152fecd8; h[7] = 0xf70e5939;
h[8] = 0x67332667; h[9] = 0xffc00b31;
h[10] = 0x8eb44a87; h[11] = 0x68581511;
h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
#else
c->h[0]=U64(0xcbbb9d5dc1059ed8); c->h[0]=U64(0xcbbb9d5dc1059ed8);
c->h[1]=U64(0x629a292a367cd507); c->h[1]=U64(0x629a292a367cd507);
c->h[2]=U64(0x9159015a3070dd17); c->h[2]=U64(0x9159015a3070dd17);
@ -82,7 +69,7 @@ fips_md_init_ctx(SHA384, SHA512)
c->h[5]=U64(0x8eb44a8768581511); c->h[5]=U64(0x8eb44a8768581511);
c->h[6]=U64(0xdb0c2e0d64f98fa7); c->h[6]=U64(0xdb0c2e0d64f98fa7);
c->h[7]=U64(0x47b5481dbefa4fa4); c->h[7]=U64(0x47b5481dbefa4fa4);
#endif
c->Nl=0; c->Nh=0; c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA384_DIGEST_LENGTH; c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
return 1; return 1;
@ -90,19 +77,6 @@ fips_md_init_ctx(SHA384, SHA512)
fips_md_init(SHA512) fips_md_init(SHA512)
{ {
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
h[8] = 0x510e527f; h[9] = 0xade682d1;
h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
h[14] = 0x5be0cd19; h[15] = 0x137e2179;
#else
c->h[0]=U64(0x6a09e667f3bcc908); c->h[0]=U64(0x6a09e667f3bcc908);
c->h[1]=U64(0xbb67ae8584caa73b); c->h[1]=U64(0xbb67ae8584caa73b);
c->h[2]=U64(0x3c6ef372fe94f82b); c->h[2]=U64(0x3c6ef372fe94f82b);
@ -111,7 +85,7 @@ fips_md_init(SHA512)
c->h[5]=U64(0x9b05688c2b3e6c1f); c->h[5]=U64(0x9b05688c2b3e6c1f);
c->h[6]=U64(0x1f83d9abfb41bd6b); c->h[6]=U64(0x1f83d9abfb41bd6b);
c->h[7]=U64(0x5be0cd19137e2179); c->h[7]=U64(0x5be0cd19137e2179);
#endif
c->Nl=0; c->Nh=0; c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA512_DIGEST_LENGTH; c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
return 1; return 1;
@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
if (md==0) return 0; if (md==0) return 0;
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* recall assembler dword order... */
n = c->md_len;
if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
{
unsigned int *h = (unsigned int *)c->h, t;
for (n/=4;n;n--)
{
t = *(h++);
*(md++) = (unsigned char)(t>>24);
*(md++) = (unsigned char)(t>>16);
*(md++) = (unsigned char)(t>>8);
*(md++) = (unsigned char)(t);
}
}
else return 0;
#else
switch (c->md_len) switch (c->md_len)
{ {
/* Let compiler decide if it's appropriate to unroll... */ /* Let compiler decide if it's appropriate to unroll... */
@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
/* ... as well as make sure md_len is not abused. */ /* ... as well as make sure md_len is not abused. */
default: return 0; default: return 0;
} }
#endif
return 1; return 1;
} }