rc4-s390x.pl: allow for older assembler and optimize character loop.

This commit is contained in:
Andy Polyakov 2009-02-12 14:48:49 +00:00
parent 13c3a1defa
commit c558c99fd8

View File

@@ -9,9 +9,9 @@
# #
# February 2009 # February 2009
# #
-# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to avoid
-# arithmetic instructions, but adhere to load and load address in
-# order to minimize Address Generation Interlock.
+# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
+# "cluster" Address Generation Interlocks, so that one pipeline stall
+# resolves several dependencies.
$rp="%r14"; $rp="%r14";
$sp="%r15"; $sp="%r15";
@@ -43,10 +43,10 @@ RC4:
llgc $XX[0],0($key) llgc $XX[0],0($key)
llgc $YY,1($key) llgc $YY,1($key)
la $XX[0],1($XX[0]) la $XX[0],1($XX[0])
llgcr $XX[0],$XX[0] nill $XX[0],0xff
llgc $TX[0],2($XX[0],$key)
srlg $cnt,$len,3 srlg $cnt,$len,3
ltgr $cnt,$cnt ltgr $cnt,$cnt
llgc $TX[0],2($XX[0],$key)
jz .Lshort jz .Lshort
j .Loop8 j .Loop8
@@ -56,17 +56,17 @@ ___
for ($i=0;$i<8;$i++) { for ($i=0;$i<8;$i++) {
$code.=<<___; $code.=<<___;
la $YY,0($YY,$TX[0]) # $i la $YY,0($YY,$TX[0]) # $i
llgcr $YY,$YY nill $YY,255
la $XX[1],1($XX[0]) la $XX[1],1($XX[0])
llgcr $XX[1],$XX[1] nill $XX[1],255
___
$code.=<<___ if ($i==1);
llgc $acc,2($TY,$key)
___ ___
$code.=<<___ if ($i>1); $code.=<<___ if ($i>1);
sllg $acc,$acc,8 sllg $acc,$acc,8
ic $acc,2($TY,$key) ic $acc,2($TY,$key)
___ ___
$code.=<<___ if ($i==1);
llgc $acc,2($TY,$key)
___
$code.=<<___; $code.=<<___;
llgc $TY,2($YY,$key) llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key) stc $TX[0],2($YY,$key)
@@ -77,7 +77,7 @@ $code.=<<___;
la $TX[1],0($TX[0]) la $TX[1],0($TX[0])
.Lcmov$i: .Lcmov$i:
la $TY,0($TY,$TX[0]) la $TY,0($TY,$TX[0])
llgcr $TY,$TY nill $TY,255
___ ___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
} }
@@ -101,18 +101,18 @@ $code.=<<___;
.align 16 .align 16
.Loop1: .Loop1:
la $YY,0($YY,$TX[0]) la $YY,0($YY,$TX[0])
llgcr $YY,$YY nill $YY,255
llgc $TY,2($YY,$key) llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key) stc $TX[0],2($YY,$key)
stc $TY,2($XX[0],$key) stc $TY,2($XX[0],$key)
la $TY,0($TY,$TX[0]) ar $TY,$TX[0]
llgcr $TY,$TY ahi $XX[0],1
la $XX[0],1($XX[0]) nill $TY,255
llgcr $XX[0],$XX[0] nill $XX[0],255
llgc $TY,2($TY,$key)
llgc $TX[0],2($XX[0],$key)
llgc $acc,0($inp) llgc $acc,0($inp)
la $inp,1($inp) la $inp,1($inp)
llgc $TY,2($TY,$key)
llgc $TX[0],2($XX[0],$key)
xr $acc,$TY xr $acc,$TY
stc $acc,0($out) stc $acc,0($out)
la $out,1($out) la $out,1($out)
@@ -168,8 +168,8 @@ RC4_set_key:
la $idx,0($idx,$acc) la $idx,0($idx,$acc)
la $ikey,1($ikey) la $ikey,1($ikey)
la $idx,0($idx,$dat) la $idx,0($idx,$dat)
nill $idx,255
la $iinp,1($iinp) la $iinp,1($iinp)
llgcr $idx,$idx
tml $ikey,255 tml $ikey,255
llgc $dat,2($idx,$key) llgc $dat,2($idx,$key)
stc $dat,2+256-1($ikey,$key) stc $dat,2+256-1($ikey,$key)