s390x assembler pack update.

This commit is contained in:
Andy Polyakov 2009-02-09 15:42:04 +00:00
parent c23632d3f1
commit 8626230a02
7 changed files with 804 additions and 449 deletions

View File

@ -131,7 +131,7 @@ my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
my $mips3_asm=":bn-mips3.o::::::::::::void";
my $s390x_asm=":bn-s390x.o::aes_cbc.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void";
my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::";
my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::";

6
TABLE
View File

@ -3542,10 +3542,10 @@ $thread_cflag = -D_REENTRANT
$sys_id =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj =
$bn_obj = bn-s390x.o
$cpuid_obj = s390xcpuid.o
$bn_obj = bn-s390x.o s390x-mont.o
$des_obj =
$aes_obj = aes_cbc.o aes-s390x.o
$aes_obj = aes-s390x.o
$bf_obj =
$md5_obj =
$sha1_obj = sha1-s390x.o sha256-s390x.o sha512-s390x.o

File diff suppressed because it is too large Load Diff

View File

@ -27,6 +27,11 @@
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
$mn0="%r0";
$num="%r1";
@ -47,7 +52,7 @@ $nhi="%r10";
$nlo="%r11";
$AHI="%r12";
$NHI="%r13";
$fp="%r14";
$count="%r14";
$sp="%r15";
$code.=<<___;
@ -57,44 +62,46 @@ $code.=<<___;
bn_mul_mont:
lgf $num,164($sp) # pull $num
sla $num,3 # $num to enumerate bytes
la $rp,0($num,$rp) # pointers to point at the vectors' ends
la $ap,0($num,$ap)
la $bp,0($num,$bp)
la $np,0($num,$np)
stmg %r2,%r15,16($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
cghi $num,128 #
bhr %r14 # if($num>128) return 0;
lcgr $num,$num # -$num
lghi $rp,-160-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
lgr $fp,$sp
aghi $fp,-160-8 # leave room for carry bit
la $sp,0($num,$fp) # alloca
stg %r0,0($sp)
aghi $fp,160-8 # $fp to point at tp[$num-1]
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
stg %r0,0($sp) # back chain
la $bp,0($num,$bp) # restore $bp
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
lg $bi,0($bp)
lg $alo,0($num,$ap)
lg $alo,0($ap)
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0
lg $nlo,0($num,$np)#
lg $nlo,0($np) #
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
lgr $j,$num
aghi $j,8 # j=1
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.L1st:
lg $alo,0($j,$ap)
mlgr $ahi,$bi # ap[j]*bp[0]
@ -110,43 +117,45 @@ bn_mul_mont:
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,0($j,$fp) # tp[j-1]=
aghi $j,8 # j++
jnz .L1st
stg $nlo,160-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,0($fp)
stg $AHI,8($fp)
stg $NHI,160-8($j,$sp)
stg $AHI,160($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
lg $alo,0($num,$ap)
lg $alo,0($ap)
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,8($num,$fp)# +=tp[0]
alg $alo,160($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($num,$np)# np[0]
lg $nlo,0($np) # np[0]
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
lgr $j,$num
aghi $j,8 # j=1
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.Linner:
lg $alo,0($j,$ap)
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,8($j,$fp) # +=tp[j]
alg $alo,160($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
@ -157,34 +166,29 @@ bn_mul_mont:
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,0($j,$fp) # tp[j-1]=
aghi $j,8 # j++
jnz .Linner
stg $nlo,160-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,8($fp) # accumulate previous upmost overflow bit
alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,0($fp)
stg $AHI,8($fp)
stg $NHI,160-8($j,$sp)
stg $AHI,160($j,$sp)
la $bp,8($bp) # bp++
clg $bp,16+32($fp) # compare to &bp[num]
clg $bp,160+8+32($j,$sp) # compare to &bp[num]
jne .Louter
___
undef $bi;
$count=$bp; undef $bp;
lg $rp,160+8+16($j,$sp) # reincarnate rp
la $ap,160($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
$code.=<<___;
lg $rp,16+16($fp) # reincarnate rp
la $ap,8($fp)
lgr $j,$num
lcgr $count,$num
sra $count,3 # incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np)
stg $alo,0($j,$rp)
@ -198,15 +202,17 @@ $code.=<<___;
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
lgr $j,$num
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
stg $j,8($j,$fp) # zap tp
stg $j,160($j,$sp) # zap tp
stg $alo,0($j,$rp)
aghi $j,8
jnz .Lcopy
la $j,8($j)
brct $count,.Lcopy
lmg %r6,%r15,16+48($fp)
la %r1,160+8+48($j,$sp)
lmg %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont

83
crypto/s390xcpuid.S Normal file
View File

@ -0,0 +1,83 @@
.text
.globl OPENSSL_s390x_facilities
.type OPENSSL_s390x_facilities,@function
.align 16
OPENSSL_s390x_facilities:
lghi %r0,0
.long 0xb2b0f010 # stfle 16(%r15)
lg %r2,16(%r15)
br %r14
.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
.globl OPENSSL_rdtsc
.type OPENSSL_rdtsc,@function
.align 16
OPENSSL_rdtsc:
stck 16(%r15)
lg %r2,16(%r15)
br %r14
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
.globl OPENSSL_atomic_add
.type OPENSSL_atomic_add,@function
.align 16
OPENSSL_atomic_add:
l %r1,0(%r2)
.Lspin: lr %r0,%r1
ar %r0,%r3
cs %r1,%r0,0(%r2)
brc 4,.Lspin
lgfr %r2,%r0 # OpenSSL expects the new value
br %r14
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
.globl OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,@function
.align 16
OPENSSL_wipe_cpu:
xgr %r0,%r0
xgr %r1,%r1
lgr %r2,%r15
xgr %r3,%r3
xgr %r4,%r4
lzdr %f0
lzdr %f1
lzdr %f2
lzdr %f3
lzdr %f4
lzdr %f5
lzdr %f6
lzdr %f7
br %r14
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,@function
.align 16
OPENSSL_cleanse:
lghi %r4,15
lghi %r0,0
clgr %r3,%r4
jh .Lot
.Little:
stc %r0,0(%r2)
la %r2,1(%r2)
brctg %r3,.Little
br %r14
.align 4
.Lot: tmll %r2,7
jz .Laligned
stc %r0,0(%r2)
la %r2,1(%r2)
brctg %r3,.Lot
.Laligned:
srlg %r4,%r3,3
.Loop: stg %r0,0(%r2)
la %r2,8(%r2)
brctg %r4,.Loop
lghi %r4,7
ngr %r3,%r4
jnz .Little
br %r14
.size OPENSSL_cleanse,.-OPENSSL_cleanse

View File

@ -15,14 +15,20 @@
# twist is that SHA1 hardware support is detected and utilized. In
# which case performance can reach further >4.5x for larger chunks.
# January 2009.
#
# Optimize Xupdate for amount of memory references and reschedule
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.
$kimdfunc=1; # magic function code for kimd instruction
$output=shift;
open STDOUT,">$output";
$t0="%r0";
$t1="%r1";
$ctx="%r2";
$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2"; $prefetch="%r2";
$inp="%r3";
$len="%r4";
@ -31,119 +37,107 @@ $B="%r6";
$C="%r7";
$D="%r8";
$E="%r9"; @V=($A,$B,$C,$D,$E);
$K_00_19="%r10";
$K_20_39="%r11";
$K_40_59="%r12";
$K_60_79="%r13";
$Xi="%r14";
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";
$frame=160+16*4;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?$Xi:$t1;
$code.=<<___ if ($i<16 && !($i&1));
lg $Xi,`$i*4`($inp)
___
$code.=<<___;
alr $e,$K_00_19 ### $i
rll $t0,$a,5
alr $e,$t0
lr $t0,$d
xr $t0,$c
nr $t0,$b
xr $t0,$d
alr $e,$t0
rll $b,$b,30
___
$code.=<<___ if ($i<16 && !($i&1));
srlg $xi,$Xi,32
stg $Xi,`160+$i*4`($sp)
___
$code.=<<___;
alr $e,$xi
___
}
sub Xupdate {
my $i=shift;
$code.=<<___ if ($i==15);
lg $prefetch,160($sp) ### Xupdate(16) warm-up
lr $X[0],$X[2]
___
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
$code.=<<___;
lg $Xi,`160+4*($i%16)`($sp) ### Xupdate($i)
xg $Xi,`160+4*(($i+2)%16)`($sp)
xg $Xi,`160+4*(($i+8)%16)`($sp)
$code.=<<___ if ($i<16);
lg $X[0],`$i*4`($inp) ### Xload($i)
rllg $X[1],$X[0],32
___
if ((($i+13)%16)==15) {
$code.=<<___;
llgf $t0,`160+4*15`($sp)
x $Xi,`160+0`($sp)
sllg $t0,$t0,32
xgr $Xi,$t0
$code.=<<___ if ($i>=16);
xgr $X[0],$prefetch ### Xupdate($i)
lg $prefetch,`160+4*(($i+2)%16)`($sp)
xg $X[0],`160+4*(($i+8)%16)`($sp)
xgr $X[0],$prefetch
rll $X[0],$X[0],1
rllg $X[1],$X[0],32
rll $X[1],$X[1],1
rllg $X[0],$X[1],32
lr $X[2],$X[1] # feedback
___
} else {
$code.=<<___;
xg $Xi,`160+4*(($i+13)%16)`($sp)
___
}
$code.=<<___;
rll $Xi,$Xi,1
rllg $t1,$Xi,32
rll $t1,$t1,1
rllg $Xi,$t1,32
stg $Xi,`160+4*($i%16)`($sp)
$code.=<<___ if ($i<=70);
stg $X[0],`160+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}
sub BODY_16_19 {
&Xupdate(@_[0]);
&BODY_00_15(@_);
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$d
xr $t0,$c
alr $e,$t1
nr $t0,$b
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?$Xi:$t1;
my $K_XX_XX=($i<40)?$K_20_39:$K_60_79;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K_XX_XX ### $i
rll $t0,$a,5
alr $e,$t0
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
xr $t0,$c
xr $t0,$d
alr $e,$t0
rll $b,$b,30
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?$Xi:$t1;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K_40_59 ### $i
rll $t0,$a,5
alr $e,$t0
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
or $t0,$c
nr $t0,$d
alr $e,$xi
lr $t1,$b
nr $t0,$d
nr $t1,$c
alr $e,$xi
or $t0,$t1
alr $e,$t0
rll $b,$b,30
alr $e,$t0
___
}
$code.=<<___;
.text
.align 64
.type Ktable,\@object
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
.size Ktable,.-Ktable
.globl sha1_block_data_order
.type sha1_block_data_order,\@function
sha1_block_data_order:
@ -165,37 +159,43 @@ $code.=<<___ if ($kimdfunc);
.Lsoftware:
___
$code.=<<___;
lghi %r1,-$frame
stg $ctx,16($sp)
stmg %r6,%r15,48($sp)
lgr %r0,$sp
aghi $sp,-$frame
la $sp,0(%r1,$sp)
stg %r0,0($sp)
sllg $len,$len,6
la $len,0($inp,$len)
larl $t0,Ktable
llgf $A,0($ctx)
llgf $B,4($ctx)
llgf $C,8($ctx)
llgf $D,12($ctx)
llgf $E,16($ctx)
llilh $K_00_19,0x5a82
oill $K_00_19,0x7999
llilh $K_20_39,0x6ed9
oill $K_20_39,0xeba1
llilh $K_40_59,0x8f1b
oill $K_40_59,0xbcdc
llilh $K_60_79,0xca62
oill $K_60_79,0xc1d6
lg $K_00_39,0($t0)
lg $K_40_79,8($t0)
.Lloop:
rllg $K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_00_39,$K_00_39,32
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $K=$K_40_79;
rllg $K_40_79,$K_40_79,32
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_40_79,$K_40_79,32
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lg $ctx,`$frame+16`($sp)
la $inp,64($inp)
al $A,0($ctx)
al $B,4($ctx)
al $C,8($ctx)
@ -206,9 +206,7 @@ $code.=<<___;
st $C,8($ctx)
st $D,12($ctx)
st $E,16($ctx)
la $inp,64($inp)
clgr $inp,$len
jne .Lloop
brct $len,.Lloop
lmg %r6,%r15,`$frame+48`($sp)
br %r14

View File

@ -20,9 +20,15 @@
#
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
# January 2009.
#
# Add support for hardware SHA512 and reschedule instructions to
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
$t0="%r0";
$t1="%r1";
$ctx="%r2";
$ctx="%r2"; $t2="%r2";
$inp="%r3";
$len="%r4"; # used as index in inner loop
@ -54,7 +60,7 @@ if ($output =~ /512/) {
@sigma0=(56,63, 7);
@sigma1=( 3,45, 6);
$rounds=80;
$kimdfunc=0; # 0 means unknown/unsupported/unimplemented
$kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
} else {
$label="256";
$SZ=4;
@ -83,32 +89,32 @@ ___
$code.=<<___;
$ROT $t0,$e,$Sigma1[0]
$ROT $t1,$e,$Sigma1[1]
lgr $t2,$f
xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t0,$t1 # Sigma1(e)
xgr $t2,$g
$ST $T1,`160+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
la $T1,0($T1,$h) # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e)
algr $T1,$h # T1+=h
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
lgr $t0,$f
xgr $t0,$g
ngr $t0,$e
xgr $t0,$g # Ch(e,f,g)
algr $T1,$t0 # T1+=Ch(e,f,g)
$ROT $h,$a,$Sigma0[0]
xgr $t2,$g # Ch(e,f,g)
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
$ROT $t0,$a,$Sigma0[1]
algr $T1,$t2 # T1+=Ch(e,f,g)
ogr $t1,$b
xgr $h,$t0
lgr $t2,$a
ngr $t1,$c
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
xgr $h,$t0 # h=Sigma0(a)
lgr $t0,$a
ogr $t0,$b
ngr $t0,$c
lgr $t1,$a
ngr $t1,$b
ogr $t0,$t1 # Maj(a,b,c)
algr $h,$t0 # h+=Maj(a,b,c)
algr $d,$T1 # d+=T1
ngr $t2,$b
algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
la $d,0($d,$T1) # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___
}
@ -120,15 +126,15 @@ $code.=<<___;
$LD $t1,`160+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0
$ROT $t0,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0 # sigma0(X[i+1])
$ROT $t0,$t1,$sigma1[0]
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2]
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14])
___
@ -225,15 +231,14 @@ $code.=<<___ if ($kimdfunc);
___
$code.=<<___;
sllg $len,$len,`log(16*$SZ)/log(2)`
la $len,0($inp,$len)
stmg $len,%r15,32($sp)
lghi %r1,-$frame
agr $len,$inp
stmg $ctx,%r15,16($sp)
lgr %r0,$sp
aghi $sp,-$frame
la $sp,0(%r1,$sp)
stg %r0,0($sp)
bras $tbl,.Lpic
.Lpic: aghi $tbl,$Table-.Lpic
larl $tbl,$Table
$LD $A,`0*$SZ`($ctx)
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
@ -255,6 +260,8 @@ $code.=<<___;
clgr $len,$t0
jne .Lrounds_16_xx
lg $ctx,`$frame+16`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx)
$ADD $C,`2*$SZ`($ctx)
@ -271,7 +278,6 @@ $code.=<<___;
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
la $inp,`16*$SZ`($inp)
clg $inp,`$frame+32`($sp)
jne .Lloop