s390x assembler pack update.

This commit is contained in:
Andy Polyakov 2009-02-09 15:42:04 +00:00
parent c23632d3f1
commit 8626230a02
7 changed files with 804 additions and 449 deletions

View File

@ -131,7 +131,7 @@ my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
my $mips3_asm=":bn-mips3.o::::::::::::void"; my $mips3_asm=":bn-mips3.o::::::::::::void";
my $s390x_asm=":bn-s390x.o::aes_cbc.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void"; my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void"; my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void";
my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::"; my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::";
my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::"; my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::";

6
TABLE
View File

@ -3542,10 +3542,10 @@ $thread_cflag = -D_REENTRANT
$sys_id = $sys_id =
$lflags = -ldl $lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj = $cpuid_obj = s390xcpuid.o
$bn_obj = bn-s390x.o $bn_obj = bn-s390x.o s390x-mont.o
$des_obj = $des_obj =
$aes_obj = aes_cbc.o aes-s390x.o $aes_obj = aes-s390x.o
$bf_obj = $bf_obj =
$md5_obj = $md5_obj =
$sha1_obj = sha1-s390x.o sha256-s390x.o sha512-s390x.o $sha1_obj = sha1-s390x.o sha256-s390x.o sha512-s390x.o

File diff suppressed because it is too large Load Diff

View File

@ -27,6 +27,11 @@
# module performance by implementing dedicated squaring code-path and # module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops... # possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
$mn0="%r0"; $mn0="%r0";
$num="%r1"; $num="%r1";
@ -47,7 +52,7 @@ $nhi="%r10";
$nlo="%r11"; $nlo="%r11";
$AHI="%r12"; $AHI="%r12";
$NHI="%r13"; $NHI="%r13";
$fp="%r14"; $count="%r14";
$sp="%r15"; $sp="%r15";
$code.=<<___; $code.=<<___;
@ -57,44 +62,46 @@ $code.=<<___;
bn_mul_mont: bn_mul_mont:
lgf $num,164($sp) # pull $num lgf $num,164($sp) # pull $num
sla $num,3 # $num to enumerate bytes sla $num,3 # $num to enumerate bytes
la $rp,0($num,$rp) # pointers to point at the vectors' ends
la $ap,0($num,$ap)
la $bp,0($num,$bp) la $bp,0($num,$bp)
la $np,0($num,$np)
stmg %r2,%r15,16($sp) stmg %r2,%r15,16($sp)
cghi $num,16 # cghi $num,16 #
lghi %r2,0 # lghi %r2,0 #
blr %r14 # if($num<16) return 0; blr %r14 # if($num<16) return 0;
cghi $num,128 #
bhr %r14 # if($num>128) return 0;
lcgr $num,$num # -$num lghi $rp,-160-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp lgr %r0,$sp
lgr $fp,$sp la $rp,0($rp,$sp)
aghi $fp,-160-8 # leave room for carry bit la $sp,0($j,$rp) # alloca
la $sp,0($num,$fp) # alloca stg %r0,0($sp) # back chain
stg %r0,0($sp)
aghi $fp,160-8 # $fp to point at tp[$num-1]
la $bp,0($num,$bp) # restore $bp sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0 lg $n0,0($n0) # pull n0
lg $bi,0($bp) lg $bi,0($bp)
lg $alo,0($num,$ap) lg $alo,0($ap)
mlgr $ahi,$bi # ap[0]*bp[0] mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0 lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0 msgr $mn0,$n0
lg $nlo,0($num,$np)# lg $nlo,0($np) #
mlgr $nhi,$mn0 # np[0]*m1 mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]" algr $nlo,$alo # +="tp[0]"
lghi $NHI,0 lghi $NHI,0
alcgr $NHI,$nhi alcgr $NHI,$nhi
lgr $j,$num la $j,8(%r0) # j=1
aghi $j,8 # j=1 lr $count,$num
.align 16
.L1st: .L1st:
lg $alo,0($j,$ap) lg $alo,0($j,$ap)
mlgr $ahi,$bi # ap[j]*bp[0] mlgr $ahi,$bi # ap[j]*bp[0]
@ -110,43 +117,45 @@ bn_mul_mont:
algr $nlo,$alo algr $nlo,$alo
alcgr $NHI,$nhi alcgr $NHI,$nhi
stg $nlo,0($j,$fp) # tp[j-1]= stg $nlo,160-8($j,$sp) # tp[j-1]=
aghi $j,8 # j++ la $j,8($j) # j++
jnz .L1st brct $count,.L1st
algr $NHI,$AHI algr $NHI,$AHI
lghi $AHI,0 lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,0($fp) stg $NHI,160-8($j,$sp)
stg $AHI,8($fp) stg $AHI,160($j,$sp)
la $bp,8($bp) # bp++ la $bp,8($bp) # bp++
.Louter: .Louter:
lg $bi,0($bp) # bp[i] lg $bi,0($bp) # bp[i]
lg $alo,0($num,$ap) lg $alo,0($ap)
mlgr $ahi,$bi # ap[0]*bp[i] mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,8($num,$fp)# +=tp[0] alg $alo,160($sp) # +=tp[0]
lghi $AHI,0 lghi $AHI,0
alcgr $AHI,$ahi alcgr $AHI,$ahi
lgr $mn0,$alo lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0 msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($num,$np)# np[0] lg $nlo,0($np) # np[0]
mlgr $nhi,$mn0 # np[0]*m1 mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]" algr $nlo,$alo # +="tp[0]"
lghi $NHI,0 lghi $NHI,0
alcgr $NHI,$nhi alcgr $NHI,$nhi
lgr $j,$num la $j,8(%r0) # j=1
aghi $j,8 # j=1 lr $count,$num
.align 16
.Linner: .Linner:
lg $alo,0($j,$ap) lg $alo,0($j,$ap)
mlgr $ahi,$bi # ap[j]*bp[i] mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI algr $alo,$AHI
lghi $AHI,0 lghi $AHI,0
alcgr $ahi,$AHI alcgr $ahi,$AHI
alg $alo,8($j,$fp) # +=tp[j] alg $alo,160($j,$sp)# +=tp[j]
alcgr $AHI,$ahi alcgr $AHI,$ahi
lg $nlo,0($j,$np) lg $nlo,0($j,$np)
@ -157,34 +166,29 @@ bn_mul_mont:
algr $nlo,$alo # +="tp[j]" algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi alcgr $NHI,$nhi
stg $nlo,0($j,$fp) # tp[j-1]= stg $nlo,160-8($j,$sp) # tp[j-1]=
aghi $j,8 # j++ la $j,8($j) # j++
jnz .Linner brct $count,.Linner
algr $NHI,$AHI algr $NHI,$AHI
lghi $AHI,0 lghi $AHI,0
alcgr $AHI,$AHI alcgr $AHI,$AHI
alg $NHI,8($fp) # accumulate previous upmost overflow bit alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0 lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,0($fp) stg $NHI,160-8($j,$sp)
stg $AHI,8($fp) stg $AHI,160($j,$sp)
la $bp,8($bp) # bp++ la $bp,8($bp) # bp++
clg $bp,16+32($fp) # compare to &bp[num] clg $bp,160+8+32($j,$sp) # compare to &bp[num]
jne .Louter jne .Louter
___
undef $bi; lg $rp,160+8+16($j,$sp) # reincarnate rp
$count=$bp; undef $bp; la $ap,160($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
$code.=<<___; la $j,0(%r0)
lg $rp,16+16($fp) # reincarnate rp lr $count,$num
la $ap,8($fp)
lgr $j,$num
lcgr $count,$num
sra $count,3 # incidentally clears "borrow"
.Lsub: lg $alo,0($j,$ap) .Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np) slbg $alo,0($j,$np)
stg $alo,0($j,$rp) stg $alo,0($j,$rp)
@ -198,15 +202,17 @@ $code.=<<___;
xgr $np,$AHI xgr $np,$AHI
ngr $np,$rp ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp ogr $ap,$np # ap=borrow?tp:rp
lgr $j,$num
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
stg $j,8($j,$fp) # zap tp stg $j,160($j,$sp) # zap tp
stg $alo,0($j,$rp) stg $alo,0($j,$rp)
aghi $j,8 la $j,8($j)
jnz .Lcopy brct $count,.Lcopy
lmg %r6,%r15,16+48($fp) la %r1,160+8+48($j,$sp)
lmg %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed" lghi %r2,1 # signal "processed"
br %r14 br %r14
.size bn_mul_mont,.-bn_mul_mont .size bn_mul_mont,.-bn_mul_mont

83
crypto/s390xcpuid.S Normal file
View File

@ -0,0 +1,83 @@
.text
/*
 * OPENSSL_s390x_facilities: executes the hand-encoded STFLE instruction
 * below, which writes to 16(%r15), and leaves the doubleword found at that
 * address in %r2.
 * NOTE(review): 16(%r15) is written without allocating a stack frame;
 * presumably this relies on the register save area that the s390x ELF ABI
 * makes available to the called function -- confirm.
 */
.globl OPENSSL_s390x_facilities
.type OPENSSL_s390x_facilities,@function
.align 16
OPENSSL_s390x_facilities:
lghi %r0,0 # %r0=0; presumably the STFLE doubleword count minus one, i.e. one doubleword -- confirm against the ISA manual
.long 0xb2b0f010 # stfle 16(%r15)
lg %r2,16(%r15) # %r2 = doubleword just written at 16(%r15)
br %r14 # return to caller
.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
/*
 * OPENSSL_rdtsc: stores the clock value with STCK into the slot at
 * 16(%r15) and leaves that 64-bit value in %r2.
 * NOTE(review): 16(%r15) is used as scratch without allocating a frame;
 * presumably the register save area provided by the caller -- confirm.
 */
.globl OPENSSL_rdtsc
.type OPENSSL_rdtsc,@function
.align 16
OPENSSL_rdtsc:
stck 16(%r15) # store clock value to memory (STCK has no register form)
lg %r2,16(%r15) # %r2 = the 64-bit value just stored
br %r14 # return to caller
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
/*
 * OPENSSL_atomic_add: adds %r3 to the 32-bit word addressed by %r2 using a
 * compare-and-swap retry loop, and leaves the updated value, sign-extended
 * to 64 bits, in %r2.
 *   %r2  address of the 32-bit word to update
 *   %r3  amount to add (low 32 bits are used)
 */
.globl OPENSSL_atomic_add
.type OPENSSL_atomic_add,@function
.align 16
OPENSSL_atomic_add:
l %r1,0(%r2) # %r1 = value currently in memory
.Lspin: lr %r0,%r1 # %r0 = copy of the expected old value
ar %r0,%r3 # %r0 = old value + increment
cs %r1,%r0,0(%r2) # store %r0 only if memory still equals %r1; on mismatch %r1 is reloaded from memory
brc 4,.Lspin # condition code 1 (compare failed): retry with the refreshed %r1
lgfr %r2,%r0 # OpenSSL expects the new value
br %r14 # return to caller
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
/*
 * OPENSSL_wipe_cpu: zeroes general registers %r0, %r1, %r3 and %r4 and
 * floating-point registers %f0-%f7, and leaves the current value of %r15
 * in %r2.
 * NOTE(review): %r5 is not cleared here although it is presumably also a
 * call-clobbered register under the s390x ELF ABI -- confirm whether that
 * omission is intentional.
 */
.globl OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,@function
.align 16
OPENSSL_wipe_cpu:
xgr %r0,%r0 # x XOR x = 0: clear %r0
xgr %r1,%r1 # clear %r1
lgr %r2,%r15 # %r2 = %r15 (overwrites whatever %r2 held)
xgr %r3,%r3 # clear %r3
xgr %r4,%r4 # clear %r4
lzdr %f0 # load zero into each of %f0..%f7
lzdr %f1
lzdr %f2
lzdr %f3
lzdr %f4
lzdr %f5
lzdr %f6
lzdr %f7
br %r14 # return to caller
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
/*
 * OPENSSL_cleanse: overwrite a buffer with zero bytes.
 *   %r2  address of the buffer (advanced as bytes are stored)
 *   %r3  number of bytes to clear
 * Buffers of up to 15 bytes are cleared byte by byte.  Longer buffers are
 * cleared byte-wise until %r2 is 8-byte aligned, then with 8-byte stores,
 * and any tail of 1..7 bytes is finished by the byte loop.
 * A zero length returns immediately: BRCTG decrements before it tests, so
 * entering the byte loop with %r3 == 0 would wrap the counter and keep
 * storing far past the end of the buffer.
 */
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,@function
.align	16
OPENSSL_cleanse:
	lghi	%r4,15
	lghi	%r0,0			# zero source for every store below
	clgr	%r3,%r4
	jh	.Lot			# len>15: take the aligned path
	clgr	%r3,%r0
	bcr	8,%r14			# len==0: nothing to clear
.Little:
	stc	%r0,0(%r2)		# clear one byte
	la	%r2,1(%r2)
	brctg	%r3,.Little		# --len, loop while non-zero
	br	%r14
.align	4
.Lot:	tmll	%r2,7			# low three address bits set?
	jz	.Laligned
	stc	%r0,0(%r2)		# byte stores until 8-byte aligned
	la	%r2,1(%r2)
	brctg	%r3,.Lot		# len>15 on entry, so this cannot reach 0
.Laligned:
	srlg	%r4,%r3,3		# number of whole doublewords (>=1 here)
.Loop:	stg	%r0,0(%r2)		# clear eight bytes
	la	%r2,8(%r2)
	brctg	%r4,.Loop
	lghi	%r4,7
	ngr	%r3,%r4			# remaining tail, len%8
	jnz	.Little			# finish the tail byte-wise
	br	%r14
.size	OPENSSL_cleanse,.-OPENSSL_cleanse

View File

@ -15,14 +15,20 @@
# twist is that SHA1 hardware support is detected and utilized. In # twist is that SHA1 hardware support is detected and utilized. In
# which case performance can reach further >4.5x for larger chunks. # which case performance can reach further >4.5x for larger chunks.
# January 2009.
#
# Optimize Xupdate to reduce the number of memory references and reschedule
# instructions to favour the dual-issue z10 pipeline. On z10, hardware is
# "only" ~2.3x faster than software.
$kimdfunc=1; # magic function code for kimd instruction $kimdfunc=1; # magic function code for kimd instruction
$output=shift; $output=shift;
open STDOUT,">$output"; open STDOUT,">$output";
$t0="%r0"; $K_00_39="%r0"; $K=$K_00_39;
$t1="%r1"; $K_40_79="%r1";
$ctx="%r2"; $ctx="%r2"; $prefetch="%r2";
$inp="%r3"; $inp="%r3";
$len="%r4"; $len="%r4";
@ -31,119 +37,107 @@ $B="%r6";
$C="%r7"; $C="%r7";
$D="%r8"; $D="%r8";
$E="%r9"; @V=($A,$B,$C,$D,$E); $E="%r9"; @V=($A,$B,$C,$D,$E);
$K_00_19="%r10"; $t0="%r10";
$K_20_39="%r11"; $t1="%r11";
$K_40_59="%r12"; @X=("%r12","%r13","%r14");
$K_60_79="%r13";
$Xi="%r14";
$sp="%r15"; $sp="%r15";
$frame=160+16*4; $frame=160+16*4;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?$Xi:$t1;
$code.=<<___ if ($i<16 && !($i&1));
lg $Xi,`$i*4`($inp)
___
$code.=<<___;
alr $e,$K_00_19 ### $i
rll $t0,$a,5
alr $e,$t0
lr $t0,$d
xr $t0,$c
nr $t0,$b
xr $t0,$d
alr $e,$t0
rll $b,$b,30
___
$code.=<<___ if ($i<16 && !($i&1));
srlg $xi,$Xi,32
stg $Xi,`160+$i*4`($sp)
___
$code.=<<___;
alr $e,$xi
___
}
sub Xupdate { sub Xupdate {
my $i=shift; my $i=shift;
$code.=<<___ if ($i==15);
lg $prefetch,160($sp) ### Xupdate(16) warm-up
lr $X[0],$X[2]
___
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
$code.=<<___; $code.=<<___ if ($i<16);
lg $Xi,`160+4*($i%16)`($sp) ### Xupdate($i) lg $X[0],`$i*4`($inp) ### Xload($i)
xg $Xi,`160+4*(($i+2)%16)`($sp) rllg $X[1],$X[0],32
xg $Xi,`160+4*(($i+8)%16)`($sp)
___ ___
if ((($i+13)%16)==15) { $code.=<<___ if ($i>=16);
$code.=<<___; xgr $X[0],$prefetch ### Xupdate($i)
llgf $t0,`160+4*15`($sp) lg $prefetch,`160+4*(($i+2)%16)`($sp)
x $Xi,`160+0`($sp) xg $X[0],`160+4*(($i+8)%16)`($sp)
sllg $t0,$t0,32 xgr $X[0],$prefetch
xgr $Xi,$t0 rll $X[0],$X[0],1
rllg $X[1],$X[0],32
rll $X[1],$X[1],1
rllg $X[0],$X[1],32
lr $X[2],$X[1] # feedback
___ ___
} else { $code.=<<___ if ($i<=70);
$code.=<<___; stg $X[0],`160+4*($i%16)`($sp)
xg $Xi,`160+4*(($i+13)%16)`($sp)
___
}
$code.=<<___;
rll $Xi,$Xi,1
rllg $t1,$Xi,32
rll $t1,$t1,1
rllg $Xi,$t1,32
stg $Xi,`160+4*($i%16)`($sp)
___ ___
unshift(@X,pop(@X));
} }
sub BODY_16_19 { sub BODY_00_19 {
&Xupdate(@_[0]); my ($i,$a,$b,$c,$d,$e)=@_;
&BODY_00_15(@_); my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$d
xr $t0,$c
alr $e,$t1
nr $t0,$b
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
} }
sub BODY_20_39 { sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_; my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?$Xi:$t1; my $xi=$X[1];
my $K_XX_XX=($i<40)?$K_20_39:$K_60_79;
&Xupdate($i); &Xupdate($i);
$code.=<<___; $code.=<<___;
alr $e,$K_XX_XX ### $i alr $e,$K ### $i
rll $t0,$a,5 rll $t1,$a,5
alr $e,$t0
lr $t0,$b lr $t0,$b
alr $e,$t1
xr $t0,$c xr $t0,$c
xr $t0,$d
alr $e,$t0
rll $b,$b,30
alr $e,$xi alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___ ___
} }
sub BODY_40_59 { sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_; my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?$Xi:$t1; my $xi=$X[1];
&Xupdate($i); &Xupdate($i);
$code.=<<___; $code.=<<___;
alr $e,$K_40_59 ### $i alr $e,$K ### $i
rll $t0,$a,5 rll $t1,$a,5
alr $e,$t0
lr $t0,$b lr $t0,$b
alr $e,$t1
or $t0,$c or $t0,$c
nr $t0,$d
alr $e,$xi
lr $t1,$b lr $t1,$b
nr $t0,$d
nr $t1,$c nr $t1,$c
alr $e,$xi
or $t0,$t1 or $t0,$t1
alr $e,$t0
rll $b,$b,30 rll $b,$b,30
alr $e,$t0
___ ___
} }
$code.=<<___; $code.=<<___;
.text .text
.align 64
.type Ktable,\@object
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
.size Ktable,.-Ktable
.globl sha1_block_data_order .globl sha1_block_data_order
.type sha1_block_data_order,\@function .type sha1_block_data_order,\@function
sha1_block_data_order: sha1_block_data_order:
@ -165,37 +159,43 @@ $code.=<<___ if ($kimdfunc);
.Lsoftware: .Lsoftware:
___ ___
$code.=<<___; $code.=<<___;
lghi %r1,-$frame
stg $ctx,16($sp)
stmg %r6,%r15,48($sp) stmg %r6,%r15,48($sp)
lgr %r0,$sp lgr %r0,$sp
aghi $sp,-$frame la $sp,0(%r1,$sp)
stg %r0,0($sp) stg %r0,0($sp)
sllg $len,$len,6 larl $t0,Ktable
la $len,0($inp,$len)
llgf $A,0($ctx) llgf $A,0($ctx)
llgf $B,4($ctx) llgf $B,4($ctx)
llgf $C,8($ctx) llgf $C,8($ctx)
llgf $D,12($ctx) llgf $D,12($ctx)
llgf $E,16($ctx) llgf $E,16($ctx)
llilh $K_00_19,0x5a82 lg $K_00_39,0($t0)
oill $K_00_19,0x7999 lg $K_40_79,8($t0)
llilh $K_20_39,0x6ed9
oill $K_20_39,0xeba1
llilh $K_40_59,0x8f1b
oill $K_40_59,0xbcdc
llilh $K_60_79,0xca62
oill $K_60_79,0xc1d6
.Lloop: .Lloop:
rllg $K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_00_39,$K_00_39,32
___ ___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $K=$K_40_79;
rllg $K_40_79,$K_40_79,32
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_40_79,$K_40_79,32
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $code.=<<___;
lg $ctx,`$frame+16`($sp)
la $inp,64($inp)
al $A,0($ctx) al $A,0($ctx)
al $B,4($ctx) al $B,4($ctx)
al $C,8($ctx) al $C,8($ctx)
@ -206,9 +206,7 @@ $code.=<<___;
st $C,8($ctx) st $C,8($ctx)
st $D,12($ctx) st $D,12($ctx)
st $E,16($ctx) st $E,16($ctx)
la $inp,64($inp) brct $len,.Lloop
clgr $inp,$len
jne .Lloop
lmg %r6,%r15,`$frame+48`($sp) lmg %r6,%r15,`$frame+48`($sp)
br %r14 br %r14

View File

@ -20,9 +20,15 @@
# #
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code. # sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
# January 2009.
#
# Add support for hardware SHA512 and reschedule instructions to
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
$t0="%r0"; $t0="%r0";
$t1="%r1"; $t1="%r1";
$ctx="%r2"; $ctx="%r2"; $t2="%r2";
$inp="%r3"; $inp="%r3";
$len="%r4"; # used as index in inner loop $len="%r4"; # used as index in inner loop
@ -54,7 +60,7 @@ if ($output =~ /512/) {
@sigma0=(56,63, 7); @sigma0=(56,63, 7);
@sigma1=( 3,45, 6); @sigma1=( 3,45, 6);
$rounds=80; $rounds=80;
$kimdfunc=0; # 0 means unknown/unsupported/unimplemented $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
} else { } else {
$label="256"; $label="256";
$SZ=4; $SZ=4;
@ -83,32 +89,32 @@ ___
$code.=<<___; $code.=<<___;
$ROT $t0,$e,$Sigma1[0] $ROT $t0,$e,$Sigma1[0]
$ROT $t1,$e,$Sigma1[1] $ROT $t1,$e,$Sigma1[1]
lgr $t2,$f
xgr $t0,$t1 xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t0,$t1 # Sigma1(e) xgr $t2,$g
$ST $T1,`160+$SZ*($i%16)`($sp) $ST $T1,`160+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
la $T1,0($T1,$h) # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e) algr $T1,$t0 # T1+=Sigma1(e)
algr $T1,$h # T1+=h
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
lgr $t0,$f
xgr $t0,$g
ngr $t0,$e
xgr $t0,$g # Ch(e,f,g)
algr $T1,$t0 # T1+=Ch(e,f,g)
$ROT $h,$a,$Sigma0[0] $ROT $h,$a,$Sigma0[0]
xgr $t2,$g # Ch(e,f,g)
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
$ROT $t0,$a,$Sigma0[1] $ROT $t0,$a,$Sigma0[1]
algr $T1,$t2 # T1+=Ch(e,f,g)
ogr $t1,$b
xgr $h,$t0 xgr $h,$t0
lgr $t2,$a
ngr $t1,$c
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]` $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
xgr $h,$t0 # h=Sigma0(a) xgr $h,$t0 # h=Sigma0(a)
lgr $t0,$a ngr $t2,$b
ogr $t0,$b
ngr $t0,$c
lgr $t1,$a
ngr $t1,$b
ogr $t0,$t1 # Maj(a,b,c)
algr $h,$t0 # h+=Maj(a,b,c)
algr $d,$T1 # d+=T1
algr $h,$T1 # h+=T1 algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
la $d,0($d,$T1) # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___ ___
} }
@ -120,15 +126,15 @@ $code.=<<___;
$LD $t1,`160+$SZ*(($i+14)%16)`($sp) $LD $t1,`160+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0] $ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2] $SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0 xgr $T1,$t0
$ROT $t0,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0 # sigma0(X[i+1])
$ROT $t0,$t1,$sigma1[0] $ROT $t0,$t1,$sigma1[0]
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2] $SHR $t1,$sigma1[2]
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0 xgr $t1,$t0
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14]) xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14]) algr $T1,$t1 # +=sigma1(X[i+14])
___ ___
@ -225,15 +231,14 @@ $code.=<<___ if ($kimdfunc);
___ ___
$code.=<<___; $code.=<<___;
sllg $len,$len,`log(16*$SZ)/log(2)` sllg $len,$len,`log(16*$SZ)/log(2)`
la $len,0($inp,$len) lghi %r1,-$frame
stmg $len,%r15,32($sp) agr $len,$inp
stmg $ctx,%r15,16($sp)
lgr %r0,$sp lgr %r0,$sp
aghi $sp,-$frame la $sp,0(%r1,$sp)
stg %r0,0($sp) stg %r0,0($sp)
bras $tbl,.Lpic larl $tbl,$Table
.Lpic: aghi $tbl,$Table-.Lpic
$LD $A,`0*$SZ`($ctx) $LD $A,`0*$SZ`($ctx)
$LD $B,`1*$SZ`($ctx) $LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx) $LD $C,`2*$SZ`($ctx)
@ -255,6 +260,8 @@ $code.=<<___;
clgr $len,$t0 clgr $len,$t0
jne .Lrounds_16_xx jne .Lrounds_16_xx
lg $ctx,`$frame+16`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx) $ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx) $ADD $B,`1*$SZ`($ctx)
$ADD $C,`2*$SZ`($ctx) $ADD $C,`2*$SZ`($ctx)
@ -271,7 +278,6 @@ $code.=<<___;
$ST $F,`5*$SZ`($ctx) $ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx) $ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx) $ST $H,`7*$SZ`($ctx)
la $inp,`16*$SZ`($inp)
clg $inp,`$frame+32`($sp) clg $inp,`$frame+32`($sp)
jne .Lloop jne .Lloop