s390x assembler pack update.
This commit is contained in:
parent
c23632d3f1
commit
8626230a02
@ -131,7 +131,7 @@ my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-
|
|||||||
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
|
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
|
||||||
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
|
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
|
||||||
my $mips3_asm=":bn-mips3.o::::::::::::void";
|
my $mips3_asm=":bn-mips3.o::::::::::::void";
|
||||||
my $s390x_asm=":bn-s390x.o::aes_cbc.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
|
my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
|
||||||
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void";
|
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void";
|
||||||
my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::";
|
my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::";
|
||||||
my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::";
|
my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::";
|
||||||
|
6
TABLE
6
TABLE
@ -3542,10 +3542,10 @@ $thread_cflag = -D_REENTRANT
|
|||||||
$sys_id =
|
$sys_id =
|
||||||
$lflags = -ldl
|
$lflags = -ldl
|
||||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
|
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
|
||||||
$cpuid_obj =
|
$cpuid_obj = s390xcpuid.o
|
||||||
$bn_obj = bn-s390x.o
|
$bn_obj = bn-s390x.o s390x-mont.o
|
||||||
$des_obj =
|
$des_obj =
|
||||||
$aes_obj = aes_cbc.o aes-s390x.o
|
$aes_obj = aes-s390x.o
|
||||||
$bf_obj =
|
$bf_obj =
|
||||||
$md5_obj =
|
$md5_obj =
|
||||||
$sha1_obj = sha1-s390x.o sha256-s390x.o sha512-s390x.o
|
$sha1_obj = sha1-s390x.o sha256-s390x.o sha512-s390x.o
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -27,6 +27,11 @@
|
|||||||
# module performance by implementing dedicated squaring code-path and
|
# module performance by implementing dedicated squaring code-path and
|
||||||
# possibly by unrolling loops...
|
# possibly by unrolling loops...
|
||||||
|
|
||||||
|
# January 2009.
|
||||||
|
#
|
||||||
|
# Reschedule to minimize/avoid Address Generation Interlock hazard,
|
||||||
|
# make inner loops counter-based.
|
||||||
|
|
||||||
$mn0="%r0";
|
$mn0="%r0";
|
||||||
$num="%r1";
|
$num="%r1";
|
||||||
|
|
||||||
@ -47,7 +52,7 @@ $nhi="%r10";
|
|||||||
$nlo="%r11";
|
$nlo="%r11";
|
||||||
$AHI="%r12";
|
$AHI="%r12";
|
||||||
$NHI="%r13";
|
$NHI="%r13";
|
||||||
$fp="%r14";
|
$count="%r14";
|
||||||
$sp="%r15";
|
$sp="%r15";
|
||||||
|
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
@ -57,44 +62,46 @@ $code.=<<___;
|
|||||||
bn_mul_mont:
|
bn_mul_mont:
|
||||||
lgf $num,164($sp) # pull $num
|
lgf $num,164($sp) # pull $num
|
||||||
sla $num,3 # $num to enumerate bytes
|
sla $num,3 # $num to enumerate bytes
|
||||||
la $rp,0($num,$rp) # pointers to point at the vectors' ends
|
|
||||||
la $ap,0($num,$ap)
|
|
||||||
la $bp,0($num,$bp)
|
la $bp,0($num,$bp)
|
||||||
la $np,0($num,$np)
|
|
||||||
|
|
||||||
stmg %r2,%r15,16($sp)
|
stmg %r2,%r15,16($sp)
|
||||||
|
|
||||||
cghi $num,16 #
|
cghi $num,16 #
|
||||||
lghi %r2,0 #
|
lghi %r2,0 #
|
||||||
blr %r14 # if($num<16) return 0;
|
blr %r14 # if($num<16) return 0;
|
||||||
|
cghi $num,128 #
|
||||||
|
bhr %r14 # if($num>128) return 0;
|
||||||
|
|
||||||
lcgr $num,$num # -$num
|
lghi $rp,-160-8 # leave room for carry bit
|
||||||
|
lcgr $j,$num # -$num
|
||||||
lgr %r0,$sp
|
lgr %r0,$sp
|
||||||
lgr $fp,$sp
|
la $rp,0($rp,$sp)
|
||||||
aghi $fp,-160-8 # leave room for carry bit
|
la $sp,0($j,$rp) # alloca
|
||||||
la $sp,0($num,$fp) # alloca
|
stg %r0,0($sp) # back chain
|
||||||
stg %r0,0($sp)
|
|
||||||
aghi $fp,160-8 # $fp to point at tp[$num-1]
|
|
||||||
|
|
||||||
la $bp,0($num,$bp) # restore $bp
|
sra $num,3 # restore $num
|
||||||
|
la $bp,0($j,$bp) # restore $bp
|
||||||
|
ahi $num,-1 # adjust $num for inner loop
|
||||||
lg $n0,0($n0) # pull n0
|
lg $n0,0($n0) # pull n0
|
||||||
|
|
||||||
lg $bi,0($bp)
|
lg $bi,0($bp)
|
||||||
lg $alo,0($num,$ap)
|
lg $alo,0($ap)
|
||||||
mlgr $ahi,$bi # ap[0]*bp[0]
|
mlgr $ahi,$bi # ap[0]*bp[0]
|
||||||
lgr $AHI,$ahi
|
lgr $AHI,$ahi
|
||||||
|
|
||||||
lgr $mn0,$alo # "tp[0]"*n0
|
lgr $mn0,$alo # "tp[0]"*n0
|
||||||
msgr $mn0,$n0
|
msgr $mn0,$n0
|
||||||
|
|
||||||
lg $nlo,0($num,$np)#
|
lg $nlo,0($np) #
|
||||||
mlgr $nhi,$mn0 # np[0]*m1
|
mlgr $nhi,$mn0 # np[0]*m1
|
||||||
algr $nlo,$alo # +="tp[0]"
|
algr $nlo,$alo # +="tp[0]"
|
||||||
lghi $NHI,0
|
lghi $NHI,0
|
||||||
alcgr $NHI,$nhi
|
alcgr $NHI,$nhi
|
||||||
|
|
||||||
lgr $j,$num
|
la $j,8(%r0) # j=1
|
||||||
aghi $j,8 # j=1
|
lr $count,$num
|
||||||
|
|
||||||
|
.align 16
|
||||||
.L1st:
|
.L1st:
|
||||||
lg $alo,0($j,$ap)
|
lg $alo,0($j,$ap)
|
||||||
mlgr $ahi,$bi # ap[j]*bp[0]
|
mlgr $ahi,$bi # ap[j]*bp[0]
|
||||||
@ -110,43 +117,45 @@ bn_mul_mont:
|
|||||||
algr $nlo,$alo
|
algr $nlo,$alo
|
||||||
alcgr $NHI,$nhi
|
alcgr $NHI,$nhi
|
||||||
|
|
||||||
stg $nlo,0($j,$fp) # tp[j-1]=
|
stg $nlo,160-8($j,$sp) # tp[j-1]=
|
||||||
aghi $j,8 # j++
|
la $j,8($j) # j++
|
||||||
jnz .L1st
|
brct $count,.L1st
|
||||||
|
|
||||||
algr $NHI,$AHI
|
algr $NHI,$AHI
|
||||||
lghi $AHI,0
|
lghi $AHI,0
|
||||||
alcgr $AHI,$AHI # upmost overflow bit
|
alcgr $AHI,$AHI # upmost overflow bit
|
||||||
stg $NHI,0($fp)
|
stg $NHI,160-8($j,$sp)
|
||||||
stg $AHI,8($fp)
|
stg $AHI,160($j,$sp)
|
||||||
la $bp,8($bp) # bp++
|
la $bp,8($bp) # bp++
|
||||||
|
|
||||||
.Louter:
|
.Louter:
|
||||||
lg $bi,0($bp) # bp[i]
|
lg $bi,0($bp) # bp[i]
|
||||||
lg $alo,0($num,$ap)
|
lg $alo,0($ap)
|
||||||
mlgr $ahi,$bi # ap[0]*bp[i]
|
mlgr $ahi,$bi # ap[0]*bp[i]
|
||||||
alg $alo,8($num,$fp)# +=tp[0]
|
alg $alo,160($sp) # +=tp[0]
|
||||||
lghi $AHI,0
|
lghi $AHI,0
|
||||||
alcgr $AHI,$ahi
|
alcgr $AHI,$ahi
|
||||||
|
|
||||||
lgr $mn0,$alo
|
lgr $mn0,$alo
|
||||||
msgr $mn0,$n0 # tp[0]*n0
|
msgr $mn0,$n0 # tp[0]*n0
|
||||||
|
|
||||||
lg $nlo,0($num,$np)# np[0]
|
lg $nlo,0($np) # np[0]
|
||||||
mlgr $nhi,$mn0 # np[0]*m1
|
mlgr $nhi,$mn0 # np[0]*m1
|
||||||
algr $nlo,$alo # +="tp[0]"
|
algr $nlo,$alo # +="tp[0]"
|
||||||
lghi $NHI,0
|
lghi $NHI,0
|
||||||
alcgr $NHI,$nhi
|
alcgr $NHI,$nhi
|
||||||
|
|
||||||
lgr $j,$num
|
la $j,8(%r0) # j=1
|
||||||
aghi $j,8 # j=1
|
lr $count,$num
|
||||||
|
|
||||||
|
.align 16
|
||||||
.Linner:
|
.Linner:
|
||||||
lg $alo,0($j,$ap)
|
lg $alo,0($j,$ap)
|
||||||
mlgr $ahi,$bi # ap[j]*bp[i]
|
mlgr $ahi,$bi # ap[j]*bp[i]
|
||||||
algr $alo,$AHI
|
algr $alo,$AHI
|
||||||
lghi $AHI,0
|
lghi $AHI,0
|
||||||
alcgr $ahi,$AHI
|
alcgr $ahi,$AHI
|
||||||
alg $alo,8($j,$fp) # +=tp[j]
|
alg $alo,160($j,$sp)# +=tp[j]
|
||||||
alcgr $AHI,$ahi
|
alcgr $AHI,$ahi
|
||||||
|
|
||||||
lg $nlo,0($j,$np)
|
lg $nlo,0($j,$np)
|
||||||
@ -157,34 +166,29 @@ bn_mul_mont:
|
|||||||
algr $nlo,$alo # +="tp[j]"
|
algr $nlo,$alo # +="tp[j]"
|
||||||
alcgr $NHI,$nhi
|
alcgr $NHI,$nhi
|
||||||
|
|
||||||
stg $nlo,0($j,$fp) # tp[j-1]=
|
stg $nlo,160-8($j,$sp) # tp[j-1]=
|
||||||
aghi $j,8 # j++
|
la $j,8($j) # j++
|
||||||
jnz .Linner
|
brct $count,.Linner
|
||||||
|
|
||||||
algr $NHI,$AHI
|
algr $NHI,$AHI
|
||||||
lghi $AHI,0
|
lghi $AHI,0
|
||||||
alcgr $AHI,$AHI
|
alcgr $AHI,$AHI
|
||||||
alg $NHI,8($fp) # accumulate previous upmost overflow bit
|
alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
|
||||||
lghi $ahi,0
|
lghi $ahi,0
|
||||||
alcgr $AHI,$ahi # new upmost overflow bit
|
alcgr $AHI,$ahi # new upmost overflow bit
|
||||||
stg $NHI,0($fp)
|
stg $NHI,160-8($j,$sp)
|
||||||
stg $AHI,8($fp)
|
stg $AHI,160($j,$sp)
|
||||||
|
|
||||||
la $bp,8($bp) # bp++
|
la $bp,8($bp) # bp++
|
||||||
clg $bp,16+32($fp) # compare to &bp[num]
|
clg $bp,160+8+32($j,$sp) # compare to &bp[num]
|
||||||
jne .Louter
|
jne .Louter
|
||||||
___
|
|
||||||
|
|
||||||
undef $bi;
|
lg $rp,160+8+16($j,$sp) # reincarnate rp
|
||||||
$count=$bp; undef $bp;
|
la $ap,160($sp)
|
||||||
|
ahi $num,1 # restore $num, incidentally clears "borrow"
|
||||||
|
|
||||||
$code.=<<___;
|
la $j,0(%r0)
|
||||||
lg $rp,16+16($fp) # reincarnate rp
|
lr $count,$num
|
||||||
la $ap,8($fp)
|
|
||||||
lgr $j,$num
|
|
||||||
|
|
||||||
lcgr $count,$num
|
|
||||||
sra $count,3 # incidentally clears "borrow"
|
|
||||||
.Lsub: lg $alo,0($j,$ap)
|
.Lsub: lg $alo,0($j,$ap)
|
||||||
slbg $alo,0($j,$np)
|
slbg $alo,0($j,$np)
|
||||||
stg $alo,0($j,$rp)
|
stg $alo,0($j,$rp)
|
||||||
@ -198,15 +202,17 @@ $code.=<<___;
|
|||||||
xgr $np,$AHI
|
xgr $np,$AHI
|
||||||
ngr $np,$rp
|
ngr $np,$rp
|
||||||
ogr $ap,$np # ap=borrow?tp:rp
|
ogr $ap,$np # ap=borrow?tp:rp
|
||||||
lgr $j,$num
|
|
||||||
|
|
||||||
|
la $j,0(%r0)
|
||||||
|
lgr $count,$num
|
||||||
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
|
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
|
||||||
stg $j,8($j,$fp) # zap tp
|
stg $j,160($j,$sp) # zap tp
|
||||||
stg $alo,0($j,$rp)
|
stg $alo,0($j,$rp)
|
||||||
aghi $j,8
|
la $j,8($j)
|
||||||
jnz .Lcopy
|
brct $count,.Lcopy
|
||||||
|
|
||||||
lmg %r6,%r15,16+48($fp)
|
la %r1,160+8+48($j,$sp)
|
||||||
|
lmg %r6,%r15,0(%r1)
|
||||||
lghi %r2,1 # signal "processed"
|
lghi %r2,1 # signal "processed"
|
||||||
br %r14
|
br %r14
|
||||||
.size bn_mul_mont,.-bn_mul_mont
|
.size bn_mul_mont,.-bn_mul_mont
|
||||||
|
83
crypto/s390xcpuid.S
Normal file
83
crypto/s390xcpuid.S
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
.text
|
||||||
|
|
||||||
|
.globl OPENSSL_s390x_facilities	# Returns in %r2 the doubleword that STFLE stores at 16(%r15).
|
||||||
|
.type OPENSSL_s390x_facilities,@function
|
||||||
|
.align 16
|
||||||
|
OPENSSL_s390x_facilities:
|
||||||
|
lghi %r0,0	# %r0 = 0; presumably STFLE takes its doubleword count (minus one) from %r0 - confirm against the Principles of Operation
|
||||||
|
.long 0xb2b0f010 # stfle 16(%r15), emitted as a raw opcode (presumably for assemblers lacking the mnemonic - TODO confirm)
|
||||||
|
lg %r2,16(%r15)	# return value: the doubleword just stored at 16(%r15)
|
||||||
|
br %r14	# return to caller
|
||||||
|
.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
|
||||||
|
|
||||||
|
.globl OPENSSL_rdtsc	# Returns in %r2 the 8-byte value that STCK stores at 16(%r15).
|
||||||
|
.type OPENSSL_rdtsc,@function
|
||||||
|
.align 16
|
||||||
|
OPENSSL_rdtsc:
|
||||||
|
stck 16(%r15)	# STORE CLOCK: write the current clock value to the scratch slot at 16(%r15)
|
||||||
|
lg %r2,16(%r15)	# load it back as the return value
|
||||||
|
br %r14	# return to caller
|
||||||
|
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
|
||||||
|
|
||||||
|
.globl OPENSSL_atomic_add	# Adds %r3 to the 32-bit word at 0(%r2) using a compare-and-swap retry loop; returns the new value in %r2.
|
||||||
|
.type OPENSSL_atomic_add,@function
|
||||||
|
.align 16
|
||||||
|
OPENSSL_atomic_add:
|
||||||
|
l %r1,0(%r2)	# %r1 = current 32-bit value, used as the expected old value for CS
|
||||||
|
.Lspin: lr %r0,%r1	# %r0 = copy of the expected old value ...
|
||||||
|
ar %r0,%r3	# ... plus the amount in %r3 = candidate new value
|
||||||
|
cs %r1,%r0,0(%r2)	# store %r0 if memory still equals %r1; otherwise %r1 is refreshed from memory
|
||||||
|
brc 4,.Lspin	# branch mask 4 (condition code 1, swap failed): retry with the refreshed %r1
|
||||||
|
lgfr %r2,%r0 # sign-extend to 64 bits; OpenSSL expects the new value
|
||||||
|
br %r14	# return to caller
|
||||||
|
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
|
||||||
|
|
||||||
|
.globl OPENSSL_wipe_cpu	# Zeroes %r0, %r1, %r3, %r4 and %f0-%f7; returns the stack pointer (%r15) in %r2.
|
||||||
|
.type OPENSSL_wipe_cpu,@function
|
||||||
|
.align 16
|
||||||
|
OPENSSL_wipe_cpu:
|
||||||
|
xgr %r0,%r0	# xor-with-self clears the full 64-bit register
|
||||||
|
xgr %r1,%r1
|
||||||
|
lgr %r2,%r15	# return value: current stack pointer
|
||||||
|
xgr %r3,%r3
|
||||||
|
xgr %r4,%r4	# NOTE(review): %r5 is not cleared by this routine - confirm that is intentional
|
||||||
|
lzdr %f0	# load zero into each of the floating-point registers %f0-%f7
|
||||||
|
lzdr %f1
|
||||||
|
lzdr %f2
|
||||||
|
lzdr %f3
|
||||||
|
lzdr %f4
|
||||||
|
lzdr %f5
|
||||||
|
lzdr %f6
|
||||||
|
lzdr %f7
|
||||||
|
br %r14	# return to caller
|
||||||
|
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
|
||||||
|
|
||||||
|
.globl OPENSSL_cleanse	# Zero-fills %r3 bytes starting at the address in %r2; a zero length is a no-op.
|
||||||
|
.type OPENSSL_cleanse,@function
|
||||||
|
.align 16
|
||||||
|
OPENSSL_cleanse:
|
||||||
|
lghi %r4,15	# threshold: lengths above 15 take the aligned doubleword path
|
||||||
|
lghi %r0,0	# %r0 = 0, the fill value (also reused for the zero-length test)
|
||||||
|
clgr %r3,%r4
|
||||||
|
jh .Lot	# len > 15: align, then store 8 bytes at a time
|
||||||
|
clgr %r3,%r0	# len == 0? brctg below would wrap 0 to -1 and run past the buffer
|
||||||
|
bcr 8,%r14	# condition code 0 (equal): nothing to wipe, return
|
||||||
|
.Little:	# byte-at-a-time loop; requires %r3 >= 1 on entry
|
||||||
|
stc %r0,0(%r2)
|
||||||
|
la %r2,1(%r2)
|
||||||
|
brctg %r3,.Little	# decrement remaining length, loop while non-zero
|
||||||
|
br %r14
|
||||||
|
.align 4
|
||||||
|
.Lot: tmll %r2,7	# test the low three address bits
|
||||||
|
jz .Laligned	# already 8-byte aligned
|
||||||
|
stc %r0,0(%r2)	# otherwise clear one byte and re-test
|
||||||
|
la %r2,1(%r2)
|
||||||
|
brctg %r3,.Lot	# at most 7 iterations, and len > 15 here, so %r3 cannot reach 0
|
||||||
|
.Laligned:
|
||||||
|
srlg %r4,%r3,3	# %r4 = number of whole doublewords left (>= 1 on this path)
|
||||||
|
.Loop: stg %r0,0(%r2)
|
||||||
|
la %r2,8(%r2)
|
||||||
|
brctg %r4,.Loop
|
||||||
|
lghi %r4,7
|
||||||
|
ngr %r3,%r4	# %r3 = len mod 8, the tail bytes still to clear
|
||||||
|
jnz .Little	# non-zero tail: finish byte by byte
|
||||||
|
br %r14
|
||||||
|
.size OPENSSL_cleanse,.-OPENSSL_cleanse
|
@ -15,14 +15,20 @@
|
|||||||
# twist is that SHA1 hardware support is detected and utilized. In
|
# twist is that SHA1 hardware support is detected and utilized. In
|
||||||
# which case performance can reach further >4.5x for larger chunks.
|
# which case performance can reach further >4.5x for larger chunks.
|
||||||
|
|
||||||
|
# January 2009.
|
||||||
|
#
|
||||||
|
# Optimize Xupdate to reduce the number of memory references and
|
||||||
|
# reschedule instructions to favour the dual-issue z10 pipeline. On
|
||||||
|
# z10, hardware SHA1 is "only" ~2.3x faster than software.
|
||||||
|
|
||||||
$kimdfunc=1; # magic function code for kimd instruction
|
$kimdfunc=1; # magic function code for kimd instruction
|
||||||
|
|
||||||
$output=shift;
|
$output=shift;
|
||||||
open STDOUT,">$output";
|
open STDOUT,">$output";
|
||||||
|
|
||||||
$t0="%r0";
|
$K_00_39="%r0"; $K=$K_00_39;
|
||||||
$t1="%r1";
|
$K_40_79="%r1";
|
||||||
$ctx="%r2";
|
$ctx="%r2"; $prefetch="%r2";
|
||||||
$inp="%r3";
|
$inp="%r3";
|
||||||
$len="%r4";
|
$len="%r4";
|
||||||
|
|
||||||
@ -31,119 +37,107 @@ $B="%r6";
|
|||||||
$C="%r7";
|
$C="%r7";
|
||||||
$D="%r8";
|
$D="%r8";
|
||||||
$E="%r9"; @V=($A,$B,$C,$D,$E);
|
$E="%r9"; @V=($A,$B,$C,$D,$E);
|
||||||
$K_00_19="%r10";
|
$t0="%r10";
|
||||||
$K_20_39="%r11";
|
$t1="%r11";
|
||||||
$K_40_59="%r12";
|
@X=("%r12","%r13","%r14");
|
||||||
$K_60_79="%r13";
|
|
||||||
$Xi="%r14";
|
|
||||||
$sp="%r15";
|
$sp="%r15";
|
||||||
|
|
||||||
$frame=160+16*4;
|
$frame=160+16*4;
|
||||||
|
|
||||||
sub BODY_00_15 {
|
|
||||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
|
||||||
my $xi=($i&1)?$Xi:$t1;
|
|
||||||
|
|
||||||
$code.=<<___ if ($i<16 && !($i&1));
|
|
||||||
lg $Xi,`$i*4`($inp)
|
|
||||||
___
|
|
||||||
$code.=<<___;
|
|
||||||
alr $e,$K_00_19 ### $i
|
|
||||||
rll $t0,$a,5
|
|
||||||
alr $e,$t0
|
|
||||||
lr $t0,$d
|
|
||||||
xr $t0,$c
|
|
||||||
nr $t0,$b
|
|
||||||
xr $t0,$d
|
|
||||||
alr $e,$t0
|
|
||||||
rll $b,$b,30
|
|
||||||
___
|
|
||||||
$code.=<<___ if ($i<16 && !($i&1));
|
|
||||||
srlg $xi,$Xi,32
|
|
||||||
stg $Xi,`160+$i*4`($sp)
|
|
||||||
___
|
|
||||||
$code.=<<___;
|
|
||||||
alr $e,$xi
|
|
||||||
___
|
|
||||||
}
|
|
||||||
|
|
||||||
sub Xupdate {
|
sub Xupdate {
|
||||||
my $i=shift;
|
my $i=shift;
|
||||||
|
|
||||||
|
$code.=<<___ if ($i==15);
|
||||||
|
lg $prefetch,160($sp) ### Xupdate(16) warm-up
|
||||||
|
lr $X[0],$X[2]
|
||||||
|
___
|
||||||
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
|
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
|
||||||
$code.=<<___;
|
$code.=<<___ if ($i<16);
|
||||||
lg $Xi,`160+4*($i%16)`($sp) ### Xupdate($i)
|
lg $X[0],`$i*4`($inp) ### Xload($i)
|
||||||
xg $Xi,`160+4*(($i+2)%16)`($sp)
|
rllg $X[1],$X[0],32
|
||||||
xg $Xi,`160+4*(($i+8)%16)`($sp)
|
|
||||||
___
|
___
|
||||||
if ((($i+13)%16)==15) {
|
$code.=<<___ if ($i>=16);
|
||||||
$code.=<<___;
|
xgr $X[0],$prefetch ### Xupdate($i)
|
||||||
llgf $t0,`160+4*15`($sp)
|
lg $prefetch,`160+4*(($i+2)%16)`($sp)
|
||||||
x $Xi,`160+0`($sp)
|
xg $X[0],`160+4*(($i+8)%16)`($sp)
|
||||||
sllg $t0,$t0,32
|
xgr $X[0],$prefetch
|
||||||
xgr $Xi,$t0
|
rll $X[0],$X[0],1
|
||||||
|
rllg $X[1],$X[0],32
|
||||||
|
rll $X[1],$X[1],1
|
||||||
|
rllg $X[0],$X[1],32
|
||||||
|
lr $X[2],$X[1] # feedback
|
||||||
___
|
___
|
||||||
} else {
|
$code.=<<___ if ($i<=70);
|
||||||
$code.=<<___;
|
stg $X[0],`160+4*($i%16)`($sp)
|
||||||
xg $Xi,`160+4*(($i+13)%16)`($sp)
|
|
||||||
___
|
|
||||||
}
|
|
||||||
$code.=<<___;
|
|
||||||
rll $Xi,$Xi,1
|
|
||||||
rllg $t1,$Xi,32
|
|
||||||
rll $t1,$t1,1
|
|
||||||
rllg $Xi,$t1,32
|
|
||||||
stg $Xi,`160+4*($i%16)`($sp)
|
|
||||||
___
|
___
|
||||||
|
unshift(@X,pop(@X));
|
||||||
}
|
}
|
||||||
|
|
||||||
sub BODY_16_19 {
|
sub BODY_00_19 {
|
||||||
&Xupdate(@_[0]);
|
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||||
&BODY_00_15(@_);
|
my $xi=$X[1];
|
||||||
|
|
||||||
|
&Xupdate($i);
|
||||||
|
$code.=<<___;
|
||||||
|
alr $e,$K ### $i
|
||||||
|
rll $t1,$a,5
|
||||||
|
lr $t0,$d
|
||||||
|
xr $t0,$c
|
||||||
|
alr $e,$t1
|
||||||
|
nr $t0,$b
|
||||||
|
alr $e,$xi
|
||||||
|
xr $t0,$d
|
||||||
|
rll $b,$b,30
|
||||||
|
alr $e,$t0
|
||||||
|
___
|
||||||
}
|
}
|
||||||
|
|
||||||
sub BODY_20_39 {
|
sub BODY_20_39 {
|
||||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||||
my $xi=($i&1)?$Xi:$t1;
|
my $xi=$X[1];
|
||||||
my $K_XX_XX=($i<40)?$K_20_39:$K_60_79;
|
|
||||||
|
|
||||||
&Xupdate($i);
|
&Xupdate($i);
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
alr $e,$K_XX_XX ### $i
|
alr $e,$K ### $i
|
||||||
rll $t0,$a,5
|
rll $t1,$a,5
|
||||||
alr $e,$t0
|
|
||||||
lr $t0,$b
|
lr $t0,$b
|
||||||
|
alr $e,$t1
|
||||||
xr $t0,$c
|
xr $t0,$c
|
||||||
xr $t0,$d
|
|
||||||
alr $e,$t0
|
|
||||||
rll $b,$b,30
|
|
||||||
alr $e,$xi
|
alr $e,$xi
|
||||||
|
xr $t0,$d
|
||||||
|
rll $b,$b,30
|
||||||
|
alr $e,$t0
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
|
|
||||||
sub BODY_40_59 {
|
sub BODY_40_59 {
|
||||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||||
my $xi=($i&1)?$Xi:$t1;
|
my $xi=$X[1];
|
||||||
|
|
||||||
&Xupdate($i);
|
&Xupdate($i);
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
alr $e,$K_40_59 ### $i
|
alr $e,$K ### $i
|
||||||
rll $t0,$a,5
|
rll $t1,$a,5
|
||||||
alr $e,$t0
|
|
||||||
lr $t0,$b
|
lr $t0,$b
|
||||||
|
alr $e,$t1
|
||||||
or $t0,$c
|
or $t0,$c
|
||||||
nr $t0,$d
|
|
||||||
alr $e,$xi
|
|
||||||
lr $t1,$b
|
lr $t1,$b
|
||||||
|
nr $t0,$d
|
||||||
nr $t1,$c
|
nr $t1,$c
|
||||||
|
alr $e,$xi
|
||||||
or $t0,$t1
|
or $t0,$t1
|
||||||
alr $e,$t0
|
|
||||||
rll $b,$b,30
|
rll $b,$b,30
|
||||||
|
alr $e,$t0
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
|
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
.text
|
.text
|
||||||
|
.align 64
|
||||||
|
.type Ktable,\@object
|
||||||
|
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
|
||||||
|
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
|
||||||
|
.size Ktable,.-Ktable
|
||||||
.globl sha1_block_data_order
|
.globl sha1_block_data_order
|
||||||
.type sha1_block_data_order,\@function
|
.type sha1_block_data_order,\@function
|
||||||
sha1_block_data_order:
|
sha1_block_data_order:
|
||||||
@ -165,37 +159,43 @@ $code.=<<___ if ($kimdfunc);
|
|||||||
.Lsoftware:
|
.Lsoftware:
|
||||||
___
|
___
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
|
lghi %r1,-$frame
|
||||||
|
stg $ctx,16($sp)
|
||||||
stmg %r6,%r15,48($sp)
|
stmg %r6,%r15,48($sp)
|
||||||
lgr %r0,$sp
|
lgr %r0,$sp
|
||||||
aghi $sp,-$frame
|
la $sp,0(%r1,$sp)
|
||||||
stg %r0,0($sp)
|
stg %r0,0($sp)
|
||||||
|
|
||||||
sllg $len,$len,6
|
larl $t0,Ktable
|
||||||
la $len,0($inp,$len)
|
|
||||||
|
|
||||||
llgf $A,0($ctx)
|
llgf $A,0($ctx)
|
||||||
llgf $B,4($ctx)
|
llgf $B,4($ctx)
|
||||||
llgf $C,8($ctx)
|
llgf $C,8($ctx)
|
||||||
llgf $D,12($ctx)
|
llgf $D,12($ctx)
|
||||||
llgf $E,16($ctx)
|
llgf $E,16($ctx)
|
||||||
|
|
||||||
llilh $K_00_19,0x5a82
|
lg $K_00_39,0($t0)
|
||||||
oill $K_00_19,0x7999
|
lg $K_40_79,8($t0)
|
||||||
llilh $K_20_39,0x6ed9
|
|
||||||
oill $K_20_39,0xeba1
|
|
||||||
llilh $K_40_59,0x8f1b
|
|
||||||
oill $K_40_59,0xbcdc
|
|
||||||
llilh $K_60_79,0xca62
|
|
||||||
oill $K_60_79,0xc1d6
|
|
||||||
.Lloop:
|
.Lloop:
|
||||||
|
rllg $K_00_39,$K_00_39,32
|
||||||
|
___
|
||||||
|
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
||||||
|
$code.=<<___;
|
||||||
|
rllg $K_00_39,$K_00_39,32
|
||||||
___
|
___
|
||||||
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
|
|
||||||
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
|
|
||||||
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||||
|
$code.=<<___; $K=$K_40_79;
|
||||||
|
rllg $K_40_79,$K_40_79,32
|
||||||
|
___
|
||||||
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
||||||
|
$code.=<<___;
|
||||||
|
rllg $K_40_79,$K_40_79,32
|
||||||
|
___
|
||||||
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
|
|
||||||
|
lg $ctx,`$frame+16`($sp)
|
||||||
|
la $inp,64($inp)
|
||||||
al $A,0($ctx)
|
al $A,0($ctx)
|
||||||
al $B,4($ctx)
|
al $B,4($ctx)
|
||||||
al $C,8($ctx)
|
al $C,8($ctx)
|
||||||
@ -206,9 +206,7 @@ $code.=<<___;
|
|||||||
st $C,8($ctx)
|
st $C,8($ctx)
|
||||||
st $D,12($ctx)
|
st $D,12($ctx)
|
||||||
st $E,16($ctx)
|
st $E,16($ctx)
|
||||||
la $inp,64($inp)
|
brct $len,.Lloop
|
||||||
clgr $inp,$len
|
|
||||||
jne .Lloop
|
|
||||||
|
|
||||||
lmg %r6,%r15,`$frame+48`($sp)
|
lmg %r6,%r15,`$frame+48`($sp)
|
||||||
br %r14
|
br %r14
|
||||||
|
@ -20,9 +20,15 @@
|
|||||||
#
|
#
|
||||||
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
|
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
|
||||||
|
|
||||||
|
# January 2009.
|
||||||
|
#
|
||||||
|
# Add support for hardware SHA512 and reschedule instructions to
|
||||||
|
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
|
||||||
|
# than software.
|
||||||
|
|
||||||
$t0="%r0";
|
$t0="%r0";
|
||||||
$t1="%r1";
|
$t1="%r1";
|
||||||
$ctx="%r2";
|
$ctx="%r2"; $t2="%r2";
|
||||||
$inp="%r3";
|
$inp="%r3";
|
||||||
$len="%r4"; # used as index in inner loop
|
$len="%r4"; # used as index in inner loop
|
||||||
|
|
||||||
@ -54,7 +60,7 @@ if ($output =~ /512/) {
|
|||||||
@sigma0=(56,63, 7);
|
@sigma0=(56,63, 7);
|
||||||
@sigma1=( 3,45, 6);
|
@sigma1=( 3,45, 6);
|
||||||
$rounds=80;
|
$rounds=80;
|
||||||
$kimdfunc=0; # 0 means unknown/unsupported/unimplemented
|
$kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
|
||||||
} else {
|
} else {
|
||||||
$label="256";
|
$label="256";
|
||||||
$SZ=4;
|
$SZ=4;
|
||||||
@ -83,32 +89,32 @@ ___
|
|||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
$ROT $t0,$e,$Sigma1[0]
|
$ROT $t0,$e,$Sigma1[0]
|
||||||
$ROT $t1,$e,$Sigma1[1]
|
$ROT $t1,$e,$Sigma1[1]
|
||||||
|
lgr $t2,$f
|
||||||
xgr $t0,$t1
|
xgr $t0,$t1
|
||||||
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
|
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
|
||||||
xgr $t0,$t1 # Sigma1(e)
|
xgr $t2,$g
|
||||||
$ST $T1,`160+$SZ*($i%16)`($sp)
|
$ST $T1,`160+$SZ*($i%16)`($sp)
|
||||||
|
xgr $t0,$t1 # Sigma1(e)
|
||||||
|
la $T1,0($T1,$h) # T1+=h
|
||||||
|
ngr $t2,$e
|
||||||
|
lgr $t1,$a
|
||||||
algr $T1,$t0 # T1+=Sigma1(e)
|
algr $T1,$t0 # T1+=Sigma1(e)
|
||||||
algr $T1,$h # T1+=h
|
|
||||||
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
|
|
||||||
lgr $t0,$f
|
|
||||||
xgr $t0,$g
|
|
||||||
ngr $t0,$e
|
|
||||||
xgr $t0,$g # Ch(e,f,g)
|
|
||||||
algr $T1,$t0 # T1+=Ch(e,f,g)
|
|
||||||
$ROT $h,$a,$Sigma0[0]
|
$ROT $h,$a,$Sigma0[0]
|
||||||
|
xgr $t2,$g # Ch(e,f,g)
|
||||||
|
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
|
||||||
$ROT $t0,$a,$Sigma0[1]
|
$ROT $t0,$a,$Sigma0[1]
|
||||||
|
algr $T1,$t2 # T1+=Ch(e,f,g)
|
||||||
|
ogr $t1,$b
|
||||||
xgr $h,$t0
|
xgr $h,$t0
|
||||||
|
lgr $t2,$a
|
||||||
|
ngr $t1,$c
|
||||||
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
|
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
|
||||||
xgr $h,$t0 # h=Sigma0(a)
|
xgr $h,$t0 # h=Sigma0(a)
|
||||||
lgr $t0,$a
|
ngr $t2,$b
|
||||||
ogr $t0,$b
|
|
||||||
ngr $t0,$c
|
|
||||||
lgr $t1,$a
|
|
||||||
ngr $t1,$b
|
|
||||||
ogr $t0,$t1 # Maj(a,b,c)
|
|
||||||
algr $h,$t0 # h+=Maj(a,b,c)
|
|
||||||
algr $d,$T1 # d+=T1
|
|
||||||
algr $h,$T1 # h+=T1
|
algr $h,$T1 # h+=T1
|
||||||
|
ogr $t2,$t1 # Maj(a,b,c)
|
||||||
|
la $d,0($d,$T1) # d+=T1
|
||||||
|
algr $h,$t2 # h+=Maj(a,b,c)
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,15 +126,15 @@ $code.=<<___;
|
|||||||
$LD $t1,`160+$SZ*(($i+14)%16)`($sp)
|
$LD $t1,`160+$SZ*(($i+14)%16)`($sp)
|
||||||
$ROT $t0,$T1,$sigma0[0]
|
$ROT $t0,$T1,$sigma0[0]
|
||||||
$SHR $T1,$sigma0[2]
|
$SHR $T1,$sigma0[2]
|
||||||
|
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
|
||||||
xgr $T1,$t0
|
xgr $T1,$t0
|
||||||
$ROT $t0,$t0,`$sigma0[1]-$sigma0[0]`
|
|
||||||
xgr $T1,$t0 # sigma0(X[i+1])
|
|
||||||
$ROT $t0,$t1,$sigma1[0]
|
$ROT $t0,$t1,$sigma1[0]
|
||||||
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
|
xgr $T1,$t2 # sigma0(X[i+1])
|
||||||
$SHR $t1,$sigma1[2]
|
$SHR $t1,$sigma1[2]
|
||||||
|
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
|
||||||
xgr $t1,$t0
|
xgr $t1,$t0
|
||||||
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
|
|
||||||
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
|
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
|
||||||
|
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
|
||||||
xgr $t1,$t0 # sigma1(X[i+14])
|
xgr $t1,$t0 # sigma1(X[i+14])
|
||||||
algr $T1,$t1 # +=sigma1(X[i+14])
|
algr $T1,$t1 # +=sigma1(X[i+14])
|
||||||
___
|
___
|
||||||
@ -225,15 +231,14 @@ $code.=<<___ if ($kimdfunc);
|
|||||||
___
|
___
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
sllg $len,$len,`log(16*$SZ)/log(2)`
|
sllg $len,$len,`log(16*$SZ)/log(2)`
|
||||||
la $len,0($inp,$len)
|
lghi %r1,-$frame
|
||||||
stmg $len,%r15,32($sp)
|
agr $len,$inp
|
||||||
|
stmg $ctx,%r15,16($sp)
|
||||||
lgr %r0,$sp
|
lgr %r0,$sp
|
||||||
aghi $sp,-$frame
|
la $sp,0(%r1,$sp)
|
||||||
stg %r0,0($sp)
|
stg %r0,0($sp)
|
||||||
|
|
||||||
bras $tbl,.Lpic
|
larl $tbl,$Table
|
||||||
.Lpic: aghi $tbl,$Table-.Lpic
|
|
||||||
|
|
||||||
$LD $A,`0*$SZ`($ctx)
|
$LD $A,`0*$SZ`($ctx)
|
||||||
$LD $B,`1*$SZ`($ctx)
|
$LD $B,`1*$SZ`($ctx)
|
||||||
$LD $C,`2*$SZ`($ctx)
|
$LD $C,`2*$SZ`($ctx)
|
||||||
@ -255,6 +260,8 @@ $code.=<<___;
|
|||||||
clgr $len,$t0
|
clgr $len,$t0
|
||||||
jne .Lrounds_16_xx
|
jne .Lrounds_16_xx
|
||||||
|
|
||||||
|
lg $ctx,`$frame+16`($sp)
|
||||||
|
la $inp,`16*$SZ`($inp)
|
||||||
$ADD $A,`0*$SZ`($ctx)
|
$ADD $A,`0*$SZ`($ctx)
|
||||||
$ADD $B,`1*$SZ`($ctx)
|
$ADD $B,`1*$SZ`($ctx)
|
||||||
$ADD $C,`2*$SZ`($ctx)
|
$ADD $C,`2*$SZ`($ctx)
|
||||||
@ -271,7 +278,6 @@ $code.=<<___;
|
|||||||
$ST $F,`5*$SZ`($ctx)
|
$ST $F,`5*$SZ`($ctx)
|
||||||
$ST $G,`6*$SZ`($ctx)
|
$ST $G,`6*$SZ`($ctx)
|
||||||
$ST $H,`7*$SZ`($ctx)
|
$ST $H,`7*$SZ`($ctx)
|
||||||
la $inp,`16*$SZ`($inp)
|
|
||||||
clg $inp,`$frame+32`($sp)
|
clg $inp,`$frame+32`($sp)
|
||||||
jne .Lloop
|
jne .Lloop
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user