ARM assembler pack update from HEAD.

This commit is contained in:
Andy Polyakov 2011-11-14 20:58:01 +00:00
parent 781bfdc314
commit 88cb59727c
10 changed files with 1068 additions and 219 deletions

View File

@ -20,13 +20,18 @@
# May 2007.
#
# private_AES_set_[en|de]crypt_key is added.
# AES_set_[en|de]crypt_key is added.
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@ -46,6 +51,7 @@ $key="r11";
$rounds="r12";
$code=<<___;
#include "arm_arch.h"
.text
.code 32
@ -166,7 +172,7 @@ AES_encrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@ -195,10 +201,33 @@ AES_encrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@ -227,11 +256,15 @@ AES_encrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@ -271,11 +304,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
eor $s1,$s1,$t1,ror#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@ -284,16 +317,16 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
eor $s2,$s2,$t2,ror#16
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
@ -333,11 +366,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
eor $s1,$t1,$s1,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@ -346,15 +379,15 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
eor $s2,$t2,$s2,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
@ -371,11 +404,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
.global private_AES_set_encrypt_key
.type private_AES_set_encrypt_key,%function
.global AES_set_encrypt_key
.type AES_set_encrypt_key,%function
.align 5
private_AES_set_encrypt_key:
sub r3,pc,#8 @ private_AES_set_encrypt_key
AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
beq .Labrt
@ -392,12 +425,13 @@ private_AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
sub $tbl,r3,#private_AES_set_encrypt_key-AES_Te-1024 @ Te4
sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
mov $rounds,r0 @ inp
mov lr,r1 @ bits
mov $key,r2 @ key
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@ -430,6 +464,22 @@ private_AES_set_encrypt_key:
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$key],#16
str $s1,[$key,#-12]
str $s2,[$key,#-8]
str $s3,[$key,#-4]
#endif
teq lr,#128
bne .Lnot128
@ -466,6 +516,7 @@ private_AES_set_encrypt_key:
b .Ldone
.Lnot128:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17]
@ -482,6 +533,16 @@ private_AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#16]
ldr $i3,[$rounds,#20]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
teq lr,#192
bne .Lnot192
@ -526,6 +587,7 @@ private_AES_set_encrypt_key:
b .L192_loop
.Lnot192:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25]
@ -542,6 +604,16 @@ private_AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#24]
ldr $i3,[$rounds,#28]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
mov $rounds,#14
str $rounds,[$key,#240-32]
@ -606,21 +678,21 @@ private_AES_set_encrypt_key:
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.global private_AES_set_decrypt_key
.type private_AES_set_decrypt_key,%function
.global AES_set_decrypt_key
.type AES_set_decrypt_key,%function
.align 5
private_AES_set_decrypt_key:
AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl private_AES_set_encrypt_key
bl AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
stmdb sp!,{r4-r12}
ldr $rounds,[r2,#240] @ private_AES_set_encrypt_key preserves r2,
ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
mov $key,r2 @ which is AES_KEY *key
mov $i1,r2
add $i2,r2,$rounds,lsl#4
@ -692,11 +764,15 @@ $code.=<<___;
bne .Lmix
mov r0,#0
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
#endif
.size AES_set_decrypt_key,.-AES_set_decrypt_key
.type AES_Td,%object
.align 5
@ -811,7 +887,7 @@ AES_decrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@ -840,10 +916,33 @@ AES_decrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@ -872,11 +971,15 @@ AES_decrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@ -916,11 +1019,11 @@ _armv4_AES_decrypt:
and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
eor $s1,$s1,$t1,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@ -929,22 +1032,22 @@ _armv4_AES_decrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
eor $s2,$s2,$t2,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s1,$s1,$i2,ror#16
eor $s2,$s2,$i3,ror#24
ldr $i1,[$key],#16
eor $s3,$s3,$t3,ror#8
eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
ldr $t1,[$key,#-12]
ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
ldr $t2,[$key,#-8]
eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1
@ -985,11 +1088,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
eor $t3,$t3,$i3,lsl#8
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@ -997,11 +1100,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
eor $t3,$t3,$i3,lsl#16
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16

51
crypto/arm_arch.h Normal file
View File

@ -0,0 +1,51 @@
#ifndef __ARM_ARCH_H__
#define __ARM_ARCH_H__
#if !defined(__ARM_ARCH__)
# if defined(__CC_ARM)
# define __ARM_ARCH__ __TARGET_ARCH_ARM
# if defined(__BIG_ENDIAN)
# define __ARMEB__
# else
# define __ARMEL__
# endif
# elif defined(__GNUC__)
/*
* Why doesn't gcc define __ARM_ARCH__? Instead it defines
* bunch of below macros. See all_architectires[] table in
* gcc/config/arm/arm.c. On a side note it defines
* __ARMEL__/__ARMEB__ for little-/big-endian.
*/
# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
defined(__ARM_ARCH_7EM__)
# define __ARM_ARCH__ 7
# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
defined(__ARM_ARCH_6T2__)
# define __ARM_ARCH__ 6
# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
defined(__ARM_ARCH_5TEJ__)
# define __ARM_ARCH__ 5
# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
# define __ARM_ARCH__ 4
# else
# error "unsupported ARM architecture"
# endif
# endif
#endif
#ifdef OPENSSL_FIPSCANISTER
#include <openssl/fipssyms.h>
#endif
#if !__ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
#define ARMV7_NEON (1<<0)
#define ARMV7_TICK (1<<1)
#endif
#endif

80
crypto/armcap.c Normal file
View File

@ -0,0 +1,80 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>
#include <signal.h>
#include <crypto.h>
#include "arm_arch.h"
unsigned int OPENSSL_armcap_P;
static sigset_t all_masked;
static sigjmp_buf ill_jmp;
static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
/*
* Following subroutines could have been inlined, but it's not all
* ARM compilers support inline assembler...
*/
void _armv7_neon_probe(void);
unsigned int _armv7_tick(void);
unsigned int OPENSSL_rdtsc(void)
{
if (OPENSSL_armcap_P|ARMV7_TICK)
return _armv7_tick();
else
return 0;
}
#if defined(__GNUC__) && __GNUC__>=2
void OPENSSL_cpuid_setup(void) __attribute__((constructor));
#endif
void OPENSSL_cpuid_setup(void)
{
char *e;
struct sigaction ill_oact,ill_act;
sigset_t oset;
static int trigger=0;
if (trigger) return;
trigger=1;
if ((e=getenv("OPENSSL_armcap")))
{
OPENSSL_armcap_P=strtoul(e,NULL,0);
return;
}
sigfillset(&all_masked);
sigdelset(&all_masked,SIGILL);
sigdelset(&all_masked,SIGTRAP);
sigdelset(&all_masked,SIGFPE);
sigdelset(&all_masked,SIGBUS);
sigdelset(&all_masked,SIGSEGV);
OPENSSL_armcap_P = 0;
memset(&ill_act,0,sizeof(ill_act));
ill_act.sa_handler = ill_handler;
ill_act.sa_mask = all_masked;
sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
sigaction(SIGILL,&ill_act,&ill_oact);
if (sigsetjmp(ill_jmp,1) == 0)
{
_armv7_neon_probe();
OPENSSL_armcap_P |= ARMV7_NEON;
}
if (sigsetjmp(ill_jmp,1) == 0)
{
_armv7_tick();
OPENSSL_armcap_P |= ARMV7_TICK;
}
sigaction (SIGILL,&ill_oact,NULL);
sigprocmask(SIG_SETMASK,&oset,NULL);
}

154
crypto/armv4cpuid.S Normal file
View File

@ -0,0 +1,154 @@
#include "arm_arch.h"
.text
.code 32
.align 5
.global _armv7_neon_probe
.type _armv7_neon_probe,%function
_armv7_neon_probe:
.word 0xf26ee1fe @ vorr q15,q15,q15
.word 0xe12fff1e @ bx lr
.size _armv7_neon_probe,.-_armv7_neon_probe
.global _armv7_tick
.type _armv7_tick,%function
_armv7_tick:
mrc p15,0,r0,c9,c13,0
.word 0xe12fff1e @ bx lr
.size _armv7_tick,.-_armv7_tick
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,%function
OPENSSL_atomic_add:
#if __ARM_ARCH__>=6
.Ladd: ldrex r2,[r0]
add r3,r2,r1
strex r2,r3,[r0]
cmp r2,#0
bne .Ladd
mov r0,r3
.word 0xe12fff1e @ bx lr
#else
stmdb sp!,{r4-r6,lr}
ldr r2,.Lspinlock
adr r3,.Lspinlock
mov r4,r0
mov r5,r1
add r6,r3,r2 @ &spinlock
b .+8
.Lspin: bl sched_yield
mov r0,#-1
swp r0,r0,[r6]
cmp r0,#0
bne .Lspin
ldr r2,[r4]
add r2,r2,r5
str r2,[r4]
str r0,[r6] @ release spinlock
ldmia sp!,{r4-r6,lr}
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
#endif
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
.global OPENSSL_cleanse
.type OPENSSL_cleanse,%function
OPENSSL_cleanse:
eor ip,ip,ip
cmp r1,#7
subhs r1,r1,#4
bhs .Lot
cmp r1,#0
beq .Lcleanse_done
.Little:
strb ip,[r0],#1
subs r1,r1,#1
bhi .Little
b .Lcleanse_done
.Lot: tst r0,#3
beq .Laligned
strb ip,[r0],#1
sub r1,r1,#1
b .Lot
.Laligned:
str ip,[r0],#4
subs r1,r1,#4
bhs .Laligned
adds r1,r1,#4
bne .Little
.Lcleanse_done:
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_cleanse,.-OPENSSL_cleanse
.global OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,%function
OPENSSL_wipe_cpu:
ldr r0,.LOPENSSL_armcap
adr r1,.LOPENSSL_armcap
ldr r0,[r1,r0]
eor r2,r2,r2
eor r3,r3,r3
eor ip,ip,ip
tst r0,#1
beq .Lwipe_done
.word 0xf3000150 @ veor q0, q0, q0
.word 0xf3022152 @ veor q1, q1, q1
.word 0xf3044154 @ veor q2, q2, q2
.word 0xf3066156 @ veor q3, q3, q3
.word 0xf34001f0 @ veor q8, q8, q8
.word 0xf34221f2 @ veor q9, q9, q9
.word 0xf34441f4 @ veor q10, q10, q10
.word 0xf34661f6 @ veor q11, q11, q11
.word 0xf34881f8 @ veor q12, q12, q12
.word 0xf34aa1fa @ veor q13, q13, q13
.word 0xf34cc1fc @ veor q14, q14, q14
.word 0xf34ee1fe @ veor q15, q15, q15
.Lwipe_done:
mov r0,sp
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.global OPENSSL_instrument_bus
.type OPENSSL_instrument_bus,%function
OPENSSL_instrument_bus:
eor r0,r0,r0
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
.global OPENSSL_instrument_bus2
.type OPENSSL_instrument_bus2,%function
OPENSSL_instrument_bus2:
eor r0,r0,r0
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.LOPENSSL_armcap
#if __ARM_ARCH__>=6
.align 5
#else
.Lspinlock:
.word atomic_add_spinlock-.Lspinlock
.align 5
.data
.align 2
atomic_add_spinlock:
.word 0
#endif
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P

278
crypto/bn/asm/armv4-gf2m.pl Normal file
View File

@ -0,0 +1,278 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_ARCH__>=7
.fpu neon
.type mul_1x1_neon,%function
.align 5
mul_1x1_neon:
vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
vshl.u64 `&Dlo("q2")`,d16,#16
vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
vshl.u64 `&Dlo("q3")`,d16,#24
vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
vshr.u64 `&Dlo("q1")`,#8
vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
vshl.u64 `&Dhi("q1")`,#24
veor d0,`&Dlo("q1")`
vshr.u64 `&Dlo("q2")`,#16
veor d0,`&Dhi("q1")`
vshl.u64 `&Dhi("q2")`,#16
veor d0,`&Dlo("q2")`
vshr.u64 `&Dlo("q3")`,#24
veor d0,`&Dhi("q2")`
vshl.u64 `&Dhi("q3")`,#8
veor d0,`&Dlo("q3")`
veor d0,`&Dhi("q3")`
bx lr
.size mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
beq .Lialu
veor $A1,$A1
vmov.32 $B1,r3,r3 @ two copies of b1
vmov.32 ${A1}[0],r1 @ a1
veor $A0,$A0
vld1.32 ${B0}[],[sp,:32] @ two copies of b0
vmov.32 ${A0}[0],r2 @ a0
mov r12,lr
vmov d16,$A1
vmov d17,$B1
bl mul_1x1_neon @ a1·b1
vmov $A1B1,d0
vmov d16,$A0
vmov d17,$B0
bl mul_1x1_neon @ a0·b0
vmov $A0B0,d0
veor d16,$A0,$A1
veor d17,$B0,$B1
veor $A0,$A0B0,$A1B1
bl mul_1x1_neon @ (a0+a1)·(b0+b1)
veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
vshl.u64 d1,d0,#32
vshr.u64 d0,d0,#32
veor $A0B0,d1
veor $A1B1,d0
vst1.32 {${A0B0}[0]},[r0,:32]!
vst1.32 {${A0B0}[1]},[r0,:32]!
vst1.32 {${A1B1}[0]},[r0,:32]!
vst1.32 {${A1B1}[1]},[r0,:32]
bx r12
.align 4
.Lialu:
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush

View File

@ -23,6 +23,9 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
@ -89,9 +92,9 @@ bn_mul_mont:
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
@ -101,21 +104,21 @@ bn_mul_mont:
bne .L1st
adds $nlo,$nlo,$ahi
mov $nhi,#0
adc $nhi,$nhi,#0
ldr $tp,[$_bp] @ restore bp
str $nlo,[$num] @ tp[num-1]=
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
@ -129,13 +132,13 @@ bn_mul_mont:
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
ldr $tj,[$tp,#8] @ tp[j+1]
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
@ -144,13 +147,13 @@ bn_mul_mont:
adds $nlo,$nlo,$ahi
mov $nhi,#0
adc $nhi,$nhi,#0
adds $nlo,$nlo,$tj
adc $nhi,$nhi,#0
ldr $tp,[$_bp] @ restore bp
ldr $tj,[$_bpend] @ restore &bp[num]
str $nlo,[$num] @ tp[num-1]=
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj

View File

@ -47,6 +47,10 @@
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@ -76,31 +80,41 @@ $code.=<<___;
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
str $t0,[$Xi,#-4]!
___
}
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
ldrb $t0,[$inp],#4
ldrb $t1,[$inp,#-1]
ldrb $t2,[$inp,#-2]
#if __ARM_ARCH__<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp,#-3]
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t1,$t0,lsl#24
ldrb $t3,[$inp],#4
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#8
orr $t0,$t0,$t3,lsl#16
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
@ -136,6 +150,8 @@ ___
}
$code=<<___;
#include "arm_arch.h"
.text
.global sha1_block_data_order
@ -209,10 +225,14 @@ $code.=<<___;
teq $inp,$len
bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1

View File

@ -18,11 +18,16 @@
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
$inp="r1";
$inp="r1"; $t3="r1";
$len="r2"; $t1="r2";
$T1="r3";
$A="r4";
@ -46,6 +51,9 @@ sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
ldr $T1,[$inp],#4
#else
ldrb $T1,[$inp,#3] @ $i
ldrb $t2,[$inp,#2]
ldrb $t1,[$inp,#1]
@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
orr $T1,$T1,$t2,lsl#8
orr $T1,$T1,$t1,lsl#16
orr $T1,$T1,$t0,lsl#24
`"str $inp,[sp,#17*4]" if ($i==15)`
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
mov $t0,$e,ror#$Sigma1[0]
str $T1,[sp,#`$i%16`*4]
ldr $t2,[$Ktbl],#4 @ *K256++
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t1,$f,$g
#if $i>=16
add $T1,$T1,$t3 @ from BODY_16_xx
#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $T1,$T1
#endif
#if $i==15
str $inp,[sp,#17*4] @ leave room for $t3
#endif
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e
str $T1,[sp,#`$i%16`*4]
add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$h
@ -71,6 +87,9 @@ $code.=<<___;
eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
#if $i>=15
ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
#endif
orr $t0,$a,$b
and $t1,$a,$b
and $t0,$t0,$c
@ -85,24 +104,26 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
mov $t0,$t3,ror#$sigma0[0]
ldr $T1,[sp,#`($i+0)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
ldr $inp,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t1,$t2,ror#$sigma1[0]
eor $t0,$t0,$t3,ror#$sigma0[1]
ldr $t1,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t3,$t2,ror#$sigma1[0]
add $T1,$T1,$t0
eor $t1,$t1,$t2,ror#$sigma1[1]
add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
eor $t3,$t3,$t2,ror#$sigma1[1]
add $T1,$T1,$t1
eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
@ add $T1,$T1,$t3
___
&BODY_00_15(@_);
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
@ -132,7 +153,7 @@ K256:
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
@ -171,10 +192,14 @@ $code.=<<___;
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
ldmia sp!,{r4-r12,lr}
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2

View File

@ -18,22 +18,33 @@
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Coxtex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 25.5 cycles or 47% faster than integer-only code.
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in h[0-7],
# namely with most significant dword at *lower* address, which is
# reflected in below two parameters. *Byte* order within these dwords
# in turn is whatever *native* byte order on current platform.
$hi=0;
$lo=4;
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0";
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
@ -61,15 +72,17 @@ $Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
ldr $t2,[sp,#$Hoff+0] @ h.lo
ldr $t3,[sp,#$Hoff+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
@ -96,25 +109,24 @@ $code.=<<___;
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#4] @ K[i].lo
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
adc $Ehi,$Ehi,$Thi @ d += T
and $t0,$t2,#0xff
teq $t0,#$magic
orreq $Ktbl,$Ktbl,#1
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@ -131,80 +143,100 @@ $code.=<<___;
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
and $t0,$Alo,$t2
orr $Alo,$Alo,$t2
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
adds $Alo,$Alo,$Tlo
adc $Ahi,$Ahi,$Thi @ h += T
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
.code 32
.type K512,%object
.align 5
K512:
.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#640 @ K512
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@ -238,6 +270,7 @@ sha512_block_data_order:
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
@ -252,26 +285,30 @@ sha512_block_data_order:
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
str $Tlo,[sp,#$Xoff+0]
str $Thi,[sp,#$Xoff+4]
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
bic $Ktbl,$Ktbl,#1
.L16_79:
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
@ -295,25 +332,24 @@ $code.=<<___;
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t0,[sp,#`$Xoff+8*16`+0]
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
str $Tlo,[sp,#$Xoff+0]
str $Thi,[sp,#$Xoff+4]
___
&BODY_00_15(0x17);
$code.=<<___;
tst $Ktbl,#1
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
@ -324,12 +360,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
@ -341,12 +377,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
@ -356,12 +392,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
adc $Ehi,$Ehi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
@ -373,12 +409,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
@ -388,13 +424,156 @@ $code.=<<___;
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
#endif
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
vadd.i64 $T1,$K,$h
veor $Ch,$f,$g
veor $t0,$t1
vand $Ch,$e
veor $t0,$t2 @ Sigma1(e)
veor $Ch,$g @ Ch(e,f,g)
vadd.i64 $T1,$t0
vshr.u64 $t0,$a,#@Sigma0[0]
vadd.i64 $T1,$Ch
vshr.u64 $t1,$a,#@Sigma0[1]
vshr.u64 $t2,$a,#@Sigma0[2]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vsli.64 $t1,$a,#`64-@Sigma0[1]`
vsli.64 $t2,$a,#`64-@Sigma0[2]`
vadd.i64 $T1,@X[$i%16]
vorr $Maj,$a,$c
vand $Ch,$a,$c
veor $h,$t0,$t1
vand $Maj,$b
veor $h,$t2 @ Sigma0(a)
vorr $Maj,$Ch @ Maj(a,b,c)
vadd.i64 $h,$T1
vadd.i64 $d,$T1
vadd.i64 $h,$Maj
___
}
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.align 4
.LNEON:
dmb @ errata #451034 on early Cortex A8
vstmdb sp!,{d8-d15} @ ABI specification says so
sub $Ktbl,r3,#672 @ K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
bx lr
#endif
___
}
$code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

View File

@ -61,19 +61,6 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
fips_md_init_ctx(SHA384, SHA512)
{
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
h[2] = 0x629a292a; h[3] = 0x367cd507;
h[4] = 0x9159015a; h[5] = 0x3070dd17;
h[6] = 0x152fecd8; h[7] = 0xf70e5939;
h[8] = 0x67332667; h[9] = 0xffc00b31;
h[10] = 0x8eb44a87; h[11] = 0x68581511;
h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
#else
c->h[0]=U64(0xcbbb9d5dc1059ed8);
c->h[1]=U64(0x629a292a367cd507);
c->h[2]=U64(0x9159015a3070dd17);
@ -82,7 +69,7 @@ fips_md_init_ctx(SHA384, SHA512)
c->h[5]=U64(0x8eb44a8768581511);
c->h[6]=U64(0xdb0c2e0d64f98fa7);
c->h[7]=U64(0x47b5481dbefa4fa4);
#endif
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
return 1;
@ -90,19 +77,6 @@ fips_md_init_ctx(SHA384, SHA512)
fips_md_init(SHA512)
{
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
h[8] = 0x510e527f; h[9] = 0xade682d1;
h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
h[14] = 0x5be0cd19; h[15] = 0x137e2179;
#else
c->h[0]=U64(0x6a09e667f3bcc908);
c->h[1]=U64(0xbb67ae8584caa73b);
c->h[2]=U64(0x3c6ef372fe94f82b);
@ -111,7 +85,7 @@ fips_md_init(SHA512)
c->h[5]=U64(0x9b05688c2b3e6c1f);
c->h[6]=U64(0x1f83d9abfb41bd6b);
c->h[7]=U64(0x5be0cd19137e2179);
#endif
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
return 1;
@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
if (md==0) return 0;
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* recall assembler dword order... */
n = c->md_len;
if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
{
unsigned int *h = (unsigned int *)c->h, t;
for (n/=4;n;n--)
{
t = *(h++);
*(md++) = (unsigned char)(t>>24);
*(md++) = (unsigned char)(t>>16);
*(md++) = (unsigned char)(t>>8);
*(md++) = (unsigned char)(t);
}
}
else return 0;
#else
switch (c->md_len)
{
/* Let compiler decide if it's appropriate to unroll... */
@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
/* ... as well as make sure md_len is not abused. */
default: return 0;
}
#endif
return 1;
}