s390x assembler pack update.
This commit is contained in:
parent
c23632d3f1
commit
8626230a02
@ -131,7 +131,7 @@ my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-
|
||||
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
|
||||
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
|
||||
my $mips3_asm=":bn-mips3.o::::::::::::void";
|
||||
my $s390x_asm=":bn-s390x.o::aes_cbc.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
|
||||
my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
|
||||
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void";
|
||||
my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::";
|
||||
my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::";
|
||||
|
6
TABLE
6
TABLE
@ -3542,10 +3542,10 @@ $thread_cflag = -D_REENTRANT
|
||||
$sys_id =
|
||||
$lflags = -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
|
||||
$cpuid_obj =
|
||||
$bn_obj = bn-s390x.o
|
||||
$cpuid_obj = s390xcpuid.o
|
||||
$bn_obj = bn-s390x.o s390x-mont.o
|
||||
$des_obj =
|
||||
$aes_obj = aes_cbc.o aes-s390x.o
|
||||
$aes_obj = aes-s390x.o
|
||||
$bf_obj =
|
||||
$md5_obj =
|
||||
$sha1_obj = sha1-s390x.o sha256-s390x.o sha512-s390x.o
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -27,6 +27,11 @@
|
||||
# module performance by implementing dedicated squaring code-path and
|
||||
# possibly by unrolling loops...
|
||||
|
||||
# January 2009.
|
||||
#
|
||||
# Reschedule to minimize/avoid Address Generation Interlock hazard,
|
||||
# make inner loops counter-based.
|
||||
|
||||
$mn0="%r0";
|
||||
$num="%r1";
|
||||
|
||||
@ -47,7 +52,7 @@ $nhi="%r10";
|
||||
$nlo="%r11";
|
||||
$AHI="%r12";
|
||||
$NHI="%r13";
|
||||
$fp="%r14";
|
||||
$count="%r14";
|
||||
$sp="%r15";
|
||||
|
||||
$code.=<<___;
|
||||
@ -57,44 +62,46 @@ $code.=<<___;
|
||||
bn_mul_mont:
|
||||
lgf $num,164($sp) # pull $num
|
||||
sla $num,3 # $num to enumerate bytes
|
||||
la $rp,0($num,$rp) # pointers to point at the vectors' ends
|
||||
la $ap,0($num,$ap)
|
||||
la $bp,0($num,$bp)
|
||||
la $np,0($num,$np)
|
||||
|
||||
stmg %r2,%r15,16($sp)
|
||||
|
||||
cghi $num,16 #
|
||||
lghi %r2,0 #
|
||||
blr %r14 # if($num<16) return 0;
|
||||
cghi $num,128 #
|
||||
bhr %r14 # if($num>128) return 0;
|
||||
|
||||
lcgr $num,$num # -$num
|
||||
lghi $rp,-160-8 # leave room for carry bit
|
||||
lcgr $j,$num # -$num
|
||||
lgr %r0,$sp
|
||||
lgr $fp,$sp
|
||||
aghi $fp,-160-8 # leave room for carry bit
|
||||
la $sp,0($num,$fp) # alloca
|
||||
stg %r0,0($sp)
|
||||
aghi $fp,160-8 # $fp to point at tp[$num-1]
|
||||
la $rp,0($rp,$sp)
|
||||
la $sp,0($j,$rp) # alloca
|
||||
stg %r0,0($sp) # back chain
|
||||
|
||||
la $bp,0($num,$bp) # restore $bp
|
||||
sra $num,3 # restore $num
|
||||
la $bp,0($j,$bp) # restore $bp
|
||||
ahi $num,-1 # adjust $num for inner loop
|
||||
lg $n0,0($n0) # pull n0
|
||||
|
||||
lg $bi,0($bp)
|
||||
lg $alo,0($num,$ap)
|
||||
lg $alo,0($ap)
|
||||
mlgr $ahi,$bi # ap[0]*bp[0]
|
||||
lgr $AHI,$ahi
|
||||
|
||||
lgr $mn0,$alo # "tp[0]"*n0
|
||||
msgr $mn0,$n0
|
||||
|
||||
lg $nlo,0($num,$np)#
|
||||
lg $nlo,0($np) #
|
||||
mlgr $nhi,$mn0 # np[0]*m1
|
||||
algr $nlo,$alo # +="tp[0]"
|
||||
lghi $NHI,0
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
lgr $j,$num
|
||||
aghi $j,8 # j=1
|
||||
la $j,8(%r0) # j=1
|
||||
lr $count,$num
|
||||
|
||||
.align 16
|
||||
.L1st:
|
||||
lg $alo,0($j,$ap)
|
||||
mlgr $ahi,$bi # ap[j]*bp[0]
|
||||
@ -110,43 +117,45 @@ bn_mul_mont:
|
||||
algr $nlo,$alo
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
stg $nlo,0($j,$fp) # tp[j-1]=
|
||||
aghi $j,8 # j++
|
||||
jnz .L1st
|
||||
stg $nlo,160-8($j,$sp) # tp[j-1]=
|
||||
la $j,8($j) # j++
|
||||
brct $count,.L1st
|
||||
|
||||
algr $NHI,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$AHI # upmost overflow bit
|
||||
stg $NHI,0($fp)
|
||||
stg $AHI,8($fp)
|
||||
stg $NHI,160-8($j,$sp)
|
||||
stg $AHI,160($j,$sp)
|
||||
la $bp,8($bp) # bp++
|
||||
|
||||
.Louter:
|
||||
lg $bi,0($bp) # bp[i]
|
||||
lg $alo,0($num,$ap)
|
||||
lg $alo,0($ap)
|
||||
mlgr $ahi,$bi # ap[0]*bp[i]
|
||||
alg $alo,8($num,$fp)# +=tp[0]
|
||||
alg $alo,160($sp) # +=tp[0]
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lgr $mn0,$alo
|
||||
msgr $mn0,$n0 # tp[0]*n0
|
||||
msgr $mn0,$n0 # tp[0]*n0
|
||||
|
||||
lg $nlo,0($num,$np)# np[0]
|
||||
lg $nlo,0($np) # np[0]
|
||||
mlgr $nhi,$mn0 # np[0]*m1
|
||||
algr $nlo,$alo # +="tp[0]"
|
||||
lghi $NHI,0
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
lgr $j,$num
|
||||
aghi $j,8 # j=1
|
||||
la $j,8(%r0) # j=1
|
||||
lr $count,$num
|
||||
|
||||
.align 16
|
||||
.Linner:
|
||||
lg $alo,0($j,$ap)
|
||||
mlgr $ahi,$bi # ap[j]*bp[i]
|
||||
algr $alo,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $ahi,$AHI
|
||||
alg $alo,8($j,$fp) # +=tp[j]
|
||||
alg $alo,160($j,$sp)# +=tp[j]
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lg $nlo,0($j,$np)
|
||||
@ -157,34 +166,29 @@ bn_mul_mont:
|
||||
algr $nlo,$alo # +="tp[j]"
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
stg $nlo,0($j,$fp) # tp[j-1]=
|
||||
aghi $j,8 # j++
|
||||
jnz .Linner
|
||||
stg $nlo,160-8($j,$sp) # tp[j-1]=
|
||||
la $j,8($j) # j++
|
||||
brct $count,.Linner
|
||||
|
||||
algr $NHI,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$AHI
|
||||
alg $NHI,8($fp) # accumulate previous upmost overflow bit
|
||||
alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
|
||||
lghi $ahi,0
|
||||
alcgr $AHI,$ahi # new upmost overflow bit
|
||||
stg $NHI,0($fp)
|
||||
stg $AHI,8($fp)
|
||||
stg $NHI,160-8($j,$sp)
|
||||
stg $AHI,160($j,$sp)
|
||||
|
||||
la $bp,8($bp) # bp++
|
||||
clg $bp,16+32($fp) # compare to &bp[num]
|
||||
clg $bp,160+8+32($j,$sp) # compare to &bp[num]
|
||||
jne .Louter
|
||||
___
|
||||
|
||||
undef $bi;
|
||||
$count=$bp; undef $bp;
|
||||
lg $rp,160+8+16($j,$sp) # reincarnate rp
|
||||
la $ap,160($sp)
|
||||
ahi $num,1 # restore $num, incidentally clears "borrow"
|
||||
|
||||
$code.=<<___;
|
||||
lg $rp,16+16($fp) # reincarnate rp
|
||||
la $ap,8($fp)
|
||||
lgr $j,$num
|
||||
|
||||
lcgr $count,$num
|
||||
sra $count,3 # incidentally clears "borrow"
|
||||
la $j,0(%r0)
|
||||
lr $count,$num
|
||||
.Lsub: lg $alo,0($j,$ap)
|
||||
slbg $alo,0($j,$np)
|
||||
stg $alo,0($j,$rp)
|
||||
@ -198,15 +202,17 @@ $code.=<<___;
|
||||
xgr $np,$AHI
|
||||
ngr $np,$rp
|
||||
ogr $ap,$np # ap=borrow?tp:rp
|
||||
lgr $j,$num
|
||||
|
||||
la $j,0(%r0)
|
||||
lgr $count,$num
|
||||
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
|
||||
stg $j,8($j,$fp) # zap tp
|
||||
stg $j,160($j,$sp) # zap tp
|
||||
stg $alo,0($j,$rp)
|
||||
aghi $j,8
|
||||
jnz .Lcopy
|
||||
la $j,8($j)
|
||||
brct $count,.Lcopy
|
||||
|
||||
lmg %r6,%r15,16+48($fp)
|
||||
la %r1,160+8+48($j,$sp)
|
||||
lmg %r6,%r15,0(%r1)
|
||||
lghi %r2,1 # signal "processed"
|
||||
br %r14
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
|
83
crypto/s390xcpuid.S
Normal file
83
crypto/s390xcpuid.S
Normal file
@ -0,0 +1,83 @@
|
||||
.text
|
||||
|
||||
.globl OPENSSL_s390x_facilities
|
||||
.type OPENSSL_s390x_facilities,@function
|
||||
.align 16
|
||||
# OPENSSL_s390x_facilities: runs stfle into the scratch doubleword at
# 16(%r15) and hands that doubleword back in %r2.
OPENSSL_s390x_facilities:
|
||||
lghi %r0,0 # %r0 = 0 going into stfle; presumably requests a single doubleword (confirm against the Principles of Operation)
|
||||
.long 0xb2b0f010 # stfle 16(%r15), emitted as a raw opcode word instead of a mnemonic
|
||||
lg %r2,16(%r15) # load the stored doubleword into %r2
|
||||
br %r14 # return to caller
|
||||
.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
|
||||
|
||||
.globl OPENSSL_rdtsc
|
||||
.type OPENSSL_rdtsc,@function
|
||||
.align 16
|
||||
# OPENSSL_rdtsc: stores the clock value with stck and returns it in %r2.
OPENSSL_rdtsc:
|
||||
stck 16(%r15) # store clock into the doubleword at 16(%r15)
|
||||
lg %r2,16(%r15) # load the stored doubleword into %r2
|
||||
br %r14 # return to caller
|
||||
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
|
||||
|
||||
.globl OPENSSL_atomic_add
|
||||
.type OPENSSL_atomic_add,@function
|
||||
.align 16
|
||||
# OPENSSL_atomic_add: adds %r3 to the 32-bit word at 0(%r2) using a
# compare-and-swap retry loop and returns the new value in %r2.
OPENSSL_atomic_add:
|
||||
l %r1,0(%r2) # %r1 = value currently at 0(%r2)
|
||||
.Lspin: lr %r0,%r1 # %r0 = copy of the observed value
|
||||
ar %r0,%r3 # %r0 = observed value + %r3
|
||||
cs %r1,%r0,0(%r2) # compare-and-swap: store %r0 only if 0(%r2) still equals %r1
|
||||
brc 4,.Lspin # mask 4 = condition code 1 (swap did not happen): try again
|
||||
lgfr %r2,%r0 # OpenSSL expects the new value
|
||||
br %r14 # return to caller
|
||||
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
|
||||
|
||||
.globl OPENSSL_wipe_cpu
|
||||
.type OPENSSL_wipe_cpu,@function
|
||||
.align 16
|
||||
# OPENSSL_wipe_cpu: zeroes general registers %r0, %r1, %r3, %r4 and
# floating-point registers %f0-%f7, and returns the value of %r15 in %r2.
# NOTE(review): %r5 is left untouched here; confirm whether it should be
# cleared as well.
OPENSSL_wipe_cpu:
|
||||
xgr %r0,%r0 # x XOR x = 0
|
||||
xgr %r1,%r1
|
||||
lgr %r2,%r15 # %r2 = current %r15 (stack pointer), left as the result
|
||||
xgr %r3,%r3
|
||||
xgr %r4,%r4
|
||||
lzdr %f0 # load zero into each of %f0-%f7
|
||||
lzdr %f1
|
||||
lzdr %f2
|
||||
lzdr %f3
|
||||
lzdr %f4
|
||||
lzdr %f5
|
||||
lzdr %f6
|
||||
lzdr %f7
|
||||
br %r14 # return to caller
|
||||
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
|
||||
|
||||
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,@function
.align	16
# OPENSSL_cleanse: overwrites a buffer with zero bytes.
#   %r2 - address of the first byte to clear (advanced as bytes are written)
#   %r3 - number of bytes to clear (unsigned 64-bit count)
# Buffers of up to 15 bytes are cleared byte by byte.  Longer buffers are
# first advanced to an 8-byte boundary, then cleared a doubleword at a
# time, and any remaining 1..7 bytes are finished by the byte loop.
OPENSSL_cleanse:
	lghi	%r4,15
	lghi	%r0,0			# %r0 = 0: the fill value and the zero to compare with
	clgr	%r3,%r4
	jh	.Lot			# more than 15 bytes: take the aligned path
	clgr	%r3,%r0
	bcr	8,%r14			# zero bytes requested: return at once, because
					# brctg below would step the count from 0 to a
					# non-zero value and keep storing
.Little:
	stc	%r0,0(%r2)		# clear one byte
	la	%r2,1(%r2)
	brctg	%r3,.Little		# repeat until the count reaches 0
	br	%r14
.align	4
.Lot:	tmll	%r2,7			# low three address bits clear?
	jz	.Laligned
	stc	%r0,0(%r2)		# clear single bytes until 8-byte aligned;
	la	%r2,1(%r2)		# at most 7 are needed and the count is above 15,
	brctg	%r3,.Lot		# so the count cannot run out in this loop
.Laligned:
	srlg	%r4,%r3,3		# %r4 = whole doublewords left (at least 1 here)
.Loop:	stg	%r0,0(%r2)		# clear 8 bytes per iteration
	la	%r2,8(%r2)
	brctg	%r4,.Loop
	lghi	%r4,7
	ngr	%r3,%r4			# %r3 = leftover byte count (count mod 8)
	jnz	.Little			# finish a non-empty tail with the byte loop
	br	%r14
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
|
@ -15,14 +15,20 @@
|
||||
# twist is that SHA1 hardware support is detected and utilized. In
|
||||
# which case performance can reach further >4.5x for larger chunks.
|
||||
|
||||
# January 2009.
|
||||
#
|
||||
# Optimize Xupdate to reduce the number of memory references and reschedule
|
||||
# instructions to favour dual-issue z10 pipeline. On z10 the hardware path is
|
||||
# "only" ~2.3x faster than software.
|
||||
|
||||
$kimdfunc=1; # magic function code for kimd instruction
|
||||
|
||||
$output=shift;
|
||||
open STDOUT,">$output";
|
||||
|
||||
$t0="%r0";
|
||||
$t1="%r1";
|
||||
$ctx="%r2";
|
||||
$K_00_39="%r0"; $K=$K_00_39;
|
||||
$K_40_79="%r1";
|
||||
$ctx="%r2"; $prefetch="%r2";
|
||||
$inp="%r3";
|
||||
$len="%r4";
|
||||
|
||||
@ -31,119 +37,107 @@ $B="%r6";
|
||||
$C="%r7";
|
||||
$D="%r8";
|
||||
$E="%r9"; @V=($A,$B,$C,$D,$E);
|
||||
$K_00_19="%r10";
|
||||
$K_20_39="%r11";
|
||||
$K_40_59="%r12";
|
||||
$K_60_79="%r13";
|
||||
$Xi="%r14";
|
||||
$t0="%r10";
|
||||
$t1="%r11";
|
||||
@X=("%r12","%r13","%r14");
|
||||
$sp="%r15";
|
||||
|
||||
$frame=160+16*4;
|
||||
|
||||
sub BODY_00_15 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $xi=($i&1)?$Xi:$t1;
|
||||
|
||||
$code.=<<___ if ($i<16 && !($i&1));
|
||||
lg $Xi,`$i*4`($inp)
|
||||
___
|
||||
$code.=<<___;
|
||||
alr $e,$K_00_19 ### $i
|
||||
rll $t0,$a,5
|
||||
alr $e,$t0
|
||||
lr $t0,$d
|
||||
xr $t0,$c
|
||||
nr $t0,$b
|
||||
xr $t0,$d
|
||||
alr $e,$t0
|
||||
rll $b,$b,30
|
||||
___
|
||||
$code.=<<___ if ($i<16 && !($i&1));
|
||||
srlg $xi,$Xi,32
|
||||
stg $Xi,`160+$i*4`($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
alr $e,$xi
|
||||
___
|
||||
}
|
||||
|
||||
sub Xupdate {
|
||||
my $i=shift;
|
||||
|
||||
$code.=<<___ if ($i==15);
|
||||
lg $prefetch,160($sp) ### Xupdate(16) warm-up
|
||||
lr $X[0],$X[2]
|
||||
___
|
||||
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
|
||||
$code.=<<___;
|
||||
lg $Xi,`160+4*($i%16)`($sp) ### Xupdate($i)
|
||||
xg $Xi,`160+4*(($i+2)%16)`($sp)
|
||||
xg $Xi,`160+4*(($i+8)%16)`($sp)
|
||||
$code.=<<___ if ($i<16);
|
||||
lg $X[0],`$i*4`($inp) ### Xload($i)
|
||||
rllg $X[1],$X[0],32
|
||||
___
|
||||
if ((($i+13)%16)==15) {
|
||||
$code.=<<___;
|
||||
llgf $t0,`160+4*15`($sp)
|
||||
x $Xi,`160+0`($sp)
|
||||
sllg $t0,$t0,32
|
||||
xgr $Xi,$t0
|
||||
$code.=<<___ if ($i>=16);
|
||||
xgr $X[0],$prefetch ### Xupdate($i)
|
||||
lg $prefetch,`160+4*(($i+2)%16)`($sp)
|
||||
xg $X[0],`160+4*(($i+8)%16)`($sp)
|
||||
xgr $X[0],$prefetch
|
||||
rll $X[0],$X[0],1
|
||||
rllg $X[1],$X[0],32
|
||||
rll $X[1],$X[1],1
|
||||
rllg $X[0],$X[1],32
|
||||
lr $X[2],$X[1] # feedback
|
||||
___
|
||||
} else {
|
||||
$code.=<<___;
|
||||
xg $Xi,`160+4*(($i+13)%16)`($sp)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
rll $Xi,$Xi,1
|
||||
rllg $t1,$Xi,32
|
||||
rll $t1,$t1,1
|
||||
rllg $Xi,$t1,32
|
||||
stg $Xi,`160+4*($i%16)`($sp)
|
||||
$code.=<<___ if ($i<=70);
|
||||
stg $X[0],`160+4*($i%16)`($sp)
|
||||
___
|
||||
unshift(@X,pop(@X));
|
||||
}
|
||||
|
||||
sub BODY_16_19 {
|
||||
&Xupdate(@_[0]);
|
||||
&BODY_00_15(@_);
|
||||
sub BODY_00_19 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $xi=$X[1];
|
||||
|
||||
&Xupdate($i);
|
||||
$code.=<<___;
|
||||
alr $e,$K ### $i
|
||||
rll $t1,$a,5
|
||||
lr $t0,$d
|
||||
xr $t0,$c
|
||||
alr $e,$t1
|
||||
nr $t0,$b
|
||||
alr $e,$xi
|
||||
xr $t0,$d
|
||||
rll $b,$b,30
|
||||
alr $e,$t0
|
||||
___
|
||||
}
|
||||
|
||||
sub BODY_20_39 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $xi=($i&1)?$Xi:$t1;
|
||||
my $K_XX_XX=($i<40)?$K_20_39:$K_60_79;
|
||||
my $xi=$X[1];
|
||||
|
||||
&Xupdate($i);
|
||||
$code.=<<___;
|
||||
alr $e,$K_XX_XX ### $i
|
||||
rll $t0,$a,5
|
||||
alr $e,$t0
|
||||
alr $e,$K ### $i
|
||||
rll $t1,$a,5
|
||||
lr $t0,$b
|
||||
alr $e,$t1
|
||||
xr $t0,$c
|
||||
xr $t0,$d
|
||||
alr $e,$t0
|
||||
rll $b,$b,30
|
||||
alr $e,$xi
|
||||
xr $t0,$d
|
||||
rll $b,$b,30
|
||||
alr $e,$t0
|
||||
___
|
||||
}
|
||||
|
||||
sub BODY_40_59 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $xi=($i&1)?$Xi:$t1;
|
||||
my $xi=$X[1];
|
||||
|
||||
&Xupdate($i);
|
||||
$code.=<<___;
|
||||
alr $e,$K_40_59 ### $i
|
||||
rll $t0,$a,5
|
||||
alr $e,$t0
|
||||
alr $e,$K ### $i
|
||||
rll $t1,$a,5
|
||||
lr $t0,$b
|
||||
alr $e,$t1
|
||||
or $t0,$c
|
||||
nr $t0,$d
|
||||
alr $e,$xi
|
||||
lr $t1,$b
|
||||
nr $t0,$d
|
||||
nr $t1,$c
|
||||
alr $e,$xi
|
||||
or $t0,$t1
|
||||
alr $e,$t0
|
||||
rll $b,$b,30
|
||||
alr $e,$t0
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
.align 64
|
||||
.type Ktable,\@object
|
||||
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
|
||||
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
|
||||
.size Ktable,.-Ktable
|
||||
.globl sha1_block_data_order
|
||||
.type sha1_block_data_order,\@function
|
||||
sha1_block_data_order:
|
||||
@ -165,37 +159,43 @@ $code.=<<___ if ($kimdfunc);
|
||||
.Lsoftware:
|
||||
___
|
||||
$code.=<<___;
|
||||
lghi %r1,-$frame
|
||||
stg $ctx,16($sp)
|
||||
stmg %r6,%r15,48($sp)
|
||||
lgr %r0,$sp
|
||||
aghi $sp,-$frame
|
||||
la $sp,0(%r1,$sp)
|
||||
stg %r0,0($sp)
|
||||
|
||||
sllg $len,$len,6
|
||||
la $len,0($inp,$len)
|
||||
|
||||
larl $t0,Ktable
|
||||
llgf $A,0($ctx)
|
||||
llgf $B,4($ctx)
|
||||
llgf $C,8($ctx)
|
||||
llgf $D,12($ctx)
|
||||
llgf $E,16($ctx)
|
||||
|
||||
llilh $K_00_19,0x5a82
|
||||
oill $K_00_19,0x7999
|
||||
llilh $K_20_39,0x6ed9
|
||||
oill $K_20_39,0xeba1
|
||||
llilh $K_40_59,0x8f1b
|
||||
oill $K_40_59,0xbcdc
|
||||
llilh $K_60_79,0xca62
|
||||
oill $K_60_79,0xc1d6
|
||||
lg $K_00_39,0($t0)
|
||||
lg $K_40_79,8($t0)
|
||||
|
||||
.Lloop:
|
||||
rllg $K_00_39,$K_00_39,32
|
||||
___
|
||||
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
rllg $K_00_39,$K_00_39,32
|
||||
___
|
||||
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
|
||||
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
|
||||
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___; $K=$K_40_79;
|
||||
rllg $K_40_79,$K_40_79,32
|
||||
___
|
||||
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
rllg $K_40_79,$K_40_79,32
|
||||
___
|
||||
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
|
||||
lg $ctx,`$frame+16`($sp)
|
||||
la $inp,64($inp)
|
||||
al $A,0($ctx)
|
||||
al $B,4($ctx)
|
||||
al $C,8($ctx)
|
||||
@ -206,9 +206,7 @@ $code.=<<___;
|
||||
st $C,8($ctx)
|
||||
st $D,12($ctx)
|
||||
st $E,16($ctx)
|
||||
la $inp,64($inp)
|
||||
clgr $inp,$len
|
||||
jne .Lloop
|
||||
brct $len,.Lloop
|
||||
|
||||
lmg %r6,%r15,`$frame+48`($sp)
|
||||
br %r14
|
||||
|
@ -20,9 +20,15 @@
|
||||
#
|
||||
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
|
||||
|
||||
# January 2009.
|
||||
#
|
||||
# Add support for hardware SHA512 and reschedule instructions to
|
||||
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
|
||||
# than software.
|
||||
|
||||
$t0="%r0";
|
||||
$t1="%r1";
|
||||
$ctx="%r2";
|
||||
$ctx="%r2"; $t2="%r2";
|
||||
$inp="%r3";
|
||||
$len="%r4"; # used as index in inner loop
|
||||
|
||||
@ -54,7 +60,7 @@ if ($output =~ /512/) {
|
||||
@sigma0=(56,63, 7);
|
||||
@sigma1=( 3,45, 6);
|
||||
$rounds=80;
|
||||
$kimdfunc=0; # 0 means unknown/unsupported/unimplemented
|
||||
$kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
|
||||
} else {
|
||||
$label="256";
|
||||
$SZ=4;
|
||||
@ -83,32 +89,32 @@ ___
|
||||
$code.=<<___;
|
||||
$ROT $t0,$e,$Sigma1[0]
|
||||
$ROT $t1,$e,$Sigma1[1]
|
||||
lgr $t2,$f
|
||||
xgr $t0,$t1
|
||||
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
|
||||
xgr $t0,$t1 # Sigma1(e)
|
||||
xgr $t2,$g
|
||||
$ST $T1,`160+$SZ*($i%16)`($sp)
|
||||
xgr $t0,$t1 # Sigma1(e)
|
||||
la $T1,0($T1,$h) # T1+=h
|
||||
ngr $t2,$e
|
||||
lgr $t1,$a
|
||||
algr $T1,$t0 # T1+=Sigma1(e)
|
||||
algr $T1,$h # T1+=h
|
||||
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
|
||||
lgr $t0,$f
|
||||
xgr $t0,$g
|
||||
ngr $t0,$e
|
||||
xgr $t0,$g # Ch(e,f,g)
|
||||
algr $T1,$t0 # T1+=Ch(e,f,g)
|
||||
$ROT $h,$a,$Sigma0[0]
|
||||
xgr $t2,$g # Ch(e,f,g)
|
||||
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
|
||||
$ROT $t0,$a,$Sigma0[1]
|
||||
algr $T1,$t2 # T1+=Ch(e,f,g)
|
||||
ogr $t1,$b
|
||||
xgr $h,$t0
|
||||
lgr $t2,$a
|
||||
ngr $t1,$c
|
||||
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
|
||||
xgr $h,$t0 # h=Sigma0(a)
|
||||
lgr $t0,$a
|
||||
ogr $t0,$b
|
||||
ngr $t0,$c
|
||||
lgr $t1,$a
|
||||
ngr $t1,$b
|
||||
ogr $t0,$t1 # Maj(a,b,c)
|
||||
algr $h,$t0 # h+=Maj(a,b,c)
|
||||
algr $d,$T1 # d+=T1
|
||||
ngr $t2,$b
|
||||
algr $h,$T1 # h+=T1
|
||||
ogr $t2,$t1 # Maj(a,b,c)
|
||||
la $d,0($d,$T1) # d+=T1
|
||||
algr $h,$t2 # h+=Maj(a,b,c)
|
||||
___
|
||||
}
|
||||
|
||||
@ -120,15 +126,15 @@ $code.=<<___;
|
||||
$LD $t1,`160+$SZ*(($i+14)%16)`($sp)
|
||||
$ROT $t0,$T1,$sigma0[0]
|
||||
$SHR $T1,$sigma0[2]
|
||||
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
|
||||
xgr $T1,$t0
|
||||
$ROT $t0,$t0,`$sigma0[1]-$sigma0[0]`
|
||||
xgr $T1,$t0 # sigma0(X[i+1])
|
||||
$ROT $t0,$t1,$sigma1[0]
|
||||
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
|
||||
xgr $T1,$t2 # sigma0(X[i+1])
|
||||
$SHR $t1,$sigma1[2]
|
||||
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
|
||||
xgr $t1,$t0
|
||||
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
|
||||
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
|
||||
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
|
||||
xgr $t1,$t0 # sigma1(X[i+14])
|
||||
algr $T1,$t1 # +=sigma1(X[i+14])
|
||||
___
|
||||
@ -225,15 +231,14 @@ $code.=<<___ if ($kimdfunc);
|
||||
___
|
||||
$code.=<<___;
|
||||
sllg $len,$len,`log(16*$SZ)/log(2)`
|
||||
la $len,0($inp,$len)
|
||||
stmg $len,%r15,32($sp)
|
||||
lghi %r1,-$frame
|
||||
agr $len,$inp
|
||||
stmg $ctx,%r15,16($sp)
|
||||
lgr %r0,$sp
|
||||
aghi $sp,-$frame
|
||||
la $sp,0(%r1,$sp)
|
||||
stg %r0,0($sp)
|
||||
|
||||
bras $tbl,.Lpic
|
||||
.Lpic: aghi $tbl,$Table-.Lpic
|
||||
|
||||
larl $tbl,$Table
|
||||
$LD $A,`0*$SZ`($ctx)
|
||||
$LD $B,`1*$SZ`($ctx)
|
||||
$LD $C,`2*$SZ`($ctx)
|
||||
@ -255,6 +260,8 @@ $code.=<<___;
|
||||
clgr $len,$t0
|
||||
jne .Lrounds_16_xx
|
||||
|
||||
lg $ctx,`$frame+16`($sp)
|
||||
la $inp,`16*$SZ`($inp)
|
||||
$ADD $A,`0*$SZ`($ctx)
|
||||
$ADD $B,`1*$SZ`($ctx)
|
||||
$ADD $C,`2*$SZ`($ctx)
|
||||
@ -271,7 +278,6 @@ $code.=<<___;
|
||||
$ST $F,`5*$SZ`($ctx)
|
||||
$ST $G,`6*$SZ`($ctx)
|
||||
$ST $H,`7*$SZ`($ctx)
|
||||
la $inp,`16*$SZ`($inp)
|
||||
clg $inp,`$frame+32`($sp)
|
||||
jne .Lloop
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user