SHA1 assembler show off: minor performance updates and new modules for
forgotten CPUs.
This commit is contained in:
parent
53f73afc4d
commit
5727f1f790
314
crypto/sha/asm/sha1-alpha.pl
Normal file
314
crypto/sha/asm/sha1-alpha.pl
Normal file
@ -0,0 +1,314 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# SHA1 block procedure for Alpha.
|
||||
|
||||
# On 21264 performance is 33% better than code generated by vendor
|
||||
# compiler, and 75% better than GCC [3.4]. Implementation features
|
||||
# vectorized byte swap, but not Xupdate.
|
||||
|
||||
# Register allocation for the generated Alpha code.
# @X: the 16-word message schedule window, kept in integer registers $0-$15.
@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
|
||||
"\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
|
||||
# Arguments of sha1_block_data_order: context pointer, input pointer and a
# count that the prologue scales by 64 (sll 6) to form the end-of-input address.
$ctx="a0"; # $16
|
||||
$inp="a1";
|
||||
$num="a2";
|
||||
# SHA-1 working variables A..E; @V is rotated by the round-generation loops.
$A="a3";
|
||||
$B="a4"; # 20
|
||||
$C="a5";
|
||||
$D="t8";
|
||||
$E="t9"; @V=($A,$B,$C,$D,$E);
|
||||
# Scratch registers.  $t2 is ra, which the prologue saves and the epilogue
# restores.
$t0="t10"; # 24
|
||||
$t1="t11";
|
||||
$t2="ra";
|
||||
$t3="t12";
|
||||
# Register holding the current round constant.
$K="AT"; # 28
|
||||
|
||||
# Emit Alpha code for one SHA-1 round in the range 0..19, where
# F(b,c,d) = (b & c) | (d & ~b)  (and/bic/or below).
#   $i        round number
#   $a..$e    names of the registers currently holding the working variables
# Appends to the global $code; relies on the globals @X, $K, $inp, $t0..$t3.
sub BODY_00_19 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
# Round 0 only: start the unaligned fetch (ldq_u at offsets 0 and 7) of the
# first eight input bytes.
$code.=<<___ if ($i==0);
|
||||
ldq_u @X[0],0+0($inp)
|
||||
ldq_u @X[1],0+7($inp)
|
||||
___
|
||||
# Even rounds below 14: issue the unaligned loads for the next eight bytes
# (words $i+2 and $i+3) ahead of their use.
$code.=<<___ if (!($i&1) && $i<14);
|
||||
ldq_u @X[$i+2],($i+2)*4+0($inp)
|
||||
ldq_u @X[$i+3],($i+2)*4+7($inp)
|
||||
___
|
||||
# Even rounds below 15: merge the two unaligned halves, byte-swap both 32-bit
# words of the quadword at once, leave word $i in @X[$i] and move the upper
# half into @X[$i+1]; the round computation is interleaved with that work.
$code.=<<___ if (!($i&1) && $i<15);
|
||||
extql @X[$i],$inp,@X[$i]
|
||||
extqh @X[$i+1],$inp,@X[$i+1]
|
||||
|
||||
or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
|
||||
|
||||
srl @X[$i],24,$t0 # vectorized byte swap
|
||||
srl @X[$i],8,$t2
|
||||
|
||||
sll @X[$i],8,$t3
|
||||
sll @X[$i],24,@X[$i]
|
||||
zapnot $t0,0x11,$t0
|
||||
zapnot $t2,0x22,$t2
|
||||
|
||||
zapnot @X[$i],0x88,@X[$i]
|
||||
or $t0,$t2,$t0
|
||||
zapnot $t3,0x44,$t3
|
||||
sll $a,5,$t1
|
||||
|
||||
or @X[$i],$t0,@X[$i]
|
||||
addl $K,$e,$e
|
||||
and $b,$c,$t2
|
||||
zapnot $a,0xf,$a
|
||||
|
||||
or @X[$i],$t3,@X[$i]
|
||||
srl $a,27,$t0
|
||||
bic $d,$b,$t3
|
||||
sll $b,30,$b
|
||||
|
||||
extll @X[$i],4,@X[$i+1] # extract upper half
|
||||
or $t2,$t3,$t2
|
||||
addl @X[$i],$e,$e
|
||||
|
||||
addl $t1,$e,$e
|
||||
srl $b,32,$t3
|
||||
zapnot @X[$i],0xf,@X[$i]
|
||||
|
||||
addl $t0,$e,$e
|
||||
addl $t2,$e,$e
|
||||
or $t3,$b,$b
|
||||
___
|
||||
# Odd rounds below 15: @X[$i] was already produced by the preceding even
# round, so only the round computation is emitted.
$code.=<<___ if (($i&1) && $i<15);
|
||||
sll $a,5,$t1
|
||||
addl $K,$e,$e
|
||||
and $b,$c,$t2
|
||||
zapnot $a,0xf,$a
|
||||
|
||||
srl $a,27,$t0
|
||||
addl @X[$i%16],$e,$e
|
||||
bic $d,$b,$t3
|
||||
sll $b,30,$b
|
||||
|
||||
or $t2,$t3,$t2
|
||||
addl $t1,$e,$e
|
||||
srl $b,32,$t3
|
||||
zapnot @X[$i],0xf,@X[$i]
|
||||
|
||||
addl $t0,$e,$e
|
||||
addl $t2,$e,$e
|
||||
or $t3,$b,$b
|
||||
___
|
||||
# Rounds 15..19: round computation plus the schedule update for the next
# word $j=$i+1: X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13] (indices mod 16), then a
# rotate left by one built from srl 31 / addl x,x / or.
$code.=<<___ if ($i>=15); # with forward Xupdate
|
||||
sll $a,5,$t1
|
||||
addl $K,$e,$e
|
||||
and $b,$c,$t2
|
||||
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
zapnot $a,0xf,$a
|
||||
addl @X[$i%16],$e,$e
|
||||
bic $d,$b,$t3
|
||||
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
srl $a,27,$t0
|
||||
addl $t1,$e,$e
|
||||
or $t2,$t3,$t2
|
||||
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
sll $b,30,$b
|
||||
addl $t0,$e,$e
|
||||
srl @X[$j%16],31,$t1
|
||||
|
||||
addl $t2,$e,$e
|
||||
srl $b,32,$t3
|
||||
addl @X[$j%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
or $t3,$b,$b
|
||||
zapnot @X[$i%16],0xf,@X[$i%16]
|
||||
or $t1,@X[$j%16],@X[$j%16]
|
||||
___
|
||||
}
|
||||
|
||||
# Emit Alpha code for one SHA-1 round with F(b,c,d) = b ^ c ^ d.  The driver
# calls this for rounds 20..39 and again for rounds 60..79.
#   $i        round number
#   $a..$e    names of the registers currently holding the working variables
# Appends to the global $code; relies on the globals @X, $K, $ctx, $t0..$t3.
sub BODY_20_39 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
# Every round except the last: round computation interleaved with the
# schedule update for word $j=$i+1 (xor of X[j+2], X[j+8], X[j+13] into X[j],
# indices mod 16, then rotate left by one).
$code.=<<___ if ($i<79); # with forward Xupdate
|
||||
sll $a,5,$t1
|
||||
addl $K,$e,$e
|
||||
zapnot $a,0xf,$a
|
||||
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
sll $b,30,$t3
|
||||
addl $t1,$e,$e
|
||||
xor $b,$c,$t2
|
||||
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
srl $b,2,$b
|
||||
addl @X[$i%16],$e,$e
|
||||
xor $d,$t2,$t2
|
||||
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
srl @X[$j%16],31,$t1
|
||||
addl $t2,$e,$e
|
||||
srl $a,27,$t0
|
||||
addl @X[$j%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
or $t3,$b,$b
|
||||
addl $t0,$e,$e
|
||||
or $t1,@X[$j%16],@X[$j%16]
|
||||
___
|
||||
# Clear the upper 32 bits of the word just consumed (zapnot 0xf keeps the low
# four bytes).  Skipped from round 77 on - presumably because those words are
# never fed into a later schedule update.
$code.=<<___ if ($i<77);
|
||||
zapnot @X[$i%16],0xf,@X[$i%16]
|
||||
___
|
||||
# Last round: no schedule update; instead the five context words are loaded
# into @X[0..4] so the code emitted after the rounds can add them to A..E.
$code.=<<___ if ($i==79); # with context fetch
|
||||
sll $a,5,$t1
|
||||
addl $K,$e,$e
|
||||
zapnot $a,0xf,$a
|
||||
ldl @X[0],0($ctx)
|
||||
|
||||
sll $b,30,$t3
|
||||
addl $t1,$e,$e
|
||||
xor $b,$c,$t2
|
||||
ldl @X[1],4($ctx)
|
||||
|
||||
srl $b,2,$b
|
||||
addl @X[$i%16],$e,$e
|
||||
xor $d,$t2,$t2
|
||||
ldl @X[2],8($ctx)
|
||||
|
||||
srl $a,27,$t0
|
||||
addl $t2,$e,$e
|
||||
ldl @X[3],12($ctx)
|
||||
|
||||
or $t3,$b,$b
|
||||
addl $t0,$e,$e
|
||||
ldl @X[4],16($ctx)
|
||||
___
|
||||
}
|
||||
|
||||
# Emit Alpha code for one SHA-1 round in the range 40..59, where
# F(b,c,d) = (b & c) | (b & d) | (c & d).  Every such round also performs the
# schedule update for word $j=$i+1.
#   $i        round number
#   $a..$e    names of the registers currently holding the working variables
# Appends to the global $code; relies on the globals @X, $K, $t0..$t3.
sub BODY_40_59 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
$code.=<<___; # with forward Xupdate
|
||||
sll $a,5,$t1
|
||||
addl $K,$e,$e
|
||||
zapnot $a,0xf,$a
|
||||
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
srl $a,27,$t0
|
||||
and $b,$c,$t2
|
||||
and $b,$d,$t3
|
||||
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
sll $b,30,$b
|
||||
addl $t1,$e,$e
|
||||
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
srl @X[$j%16],31,$t1
|
||||
addl $t0,$e,$e
|
||||
or $t2,$t3,$t2
|
||||
and $c,$d,$t3
|
||||
|
||||
or $t2,$t3,$t2
|
||||
srl $b,32,$t3
|
||||
addl @X[$i%16],$e,$e
|
||||
addl @X[$j%16],@X[$j%16],@X[$j%16]
|
||||
|
||||
or $t3,$b,$b
|
||||
addl $t2,$e,$e
|
||||
or $t1,@X[$j%16],@X[$j%16]
|
||||
zapnot @X[$i%16],0xf,@X[$i%16]
|
||||
___
|
||||
}
|
||||
|
||||
# Start of the generated file: function prologue and loop head.  The emitted
# code allocates a 64-byte frame, saves ra, s0-s5 and fp, loads A..E from the
# context, converts $num into the end-of-input address ($inp + ($num << 6)),
# and at .Lloop loads the first round constant (23170*65536 + 31129 =
# 0x5a827999) and clears the upper half of B.
$code=<<___;
|
||||
#include <asm.h>
|
||||
#include <regdef.h>
|
||||
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set noreorder
|
||||
.globl sha1_block_data_order
|
||||
.align 5
|
||||
.ent sha1_block_data_order
|
||||
sha1_block_data_order:
|
||||
lda sp,-64(sp)
|
||||
stq ra,0(sp)
|
||||
stq s0,8(sp)
|
||||
stq s1,16(sp)
|
||||
stq s2,24(sp)
|
||||
stq s3,32(sp)
|
||||
stq s4,40(sp)
|
||||
stq s5,48(sp)
|
||||
stq fp,56(sp)
|
||||
.mask 0x0400fe00,-64
|
||||
.frame sp,64,ra
|
||||
.prologue 0
|
||||
|
||||
ldl $A,0($ctx)
|
||||
ldl $B,4($ctx)
|
||||
sll $num,6,$num
|
||||
ldl $C,8($ctx)
|
||||
ldl $D,12($ctx)
|
||||
ldl $E,16($ctx)
|
||||
addq $inp,$num,$num
|
||||
|
||||
.Lloop:
|
||||
.set noreorder
|
||||
ldah $K,23170(zero)
|
||||
zapnot $B,0xf,$B
|
||||
lda $K,31129($K) # K_00_19
|
||||
___
|
||||
# Unroll all 80 rounds.  After each round @V is rotated right by one: the
# register that just accumulated the round result (the old E) becomes the new
# A for the following round.  $i deliberately carries over between the loops.
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
||||
|
||||
# Each constant is built with ldah (high 16 bits) followed by lda, whose
# 16-bit displacement is sign-extended and added - hence the adjusted high
# halves.  Low 32 bits here: 28378*65536 - 5215 = 0x6ed9eba1.
$code.=<<___;
|
||||
ldah $K,28378(zero)
|
||||
lda $K,-5215($K) # K_20_39
|
||||
___
|
||||
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
|
||||
# Low 32 bits: 0x8f1bbcdc.
$code.=<<___;
|
||||
ldah $K,-28900(zero)
|
||||
lda $K,-17188($K) # K_40_59
|
||||
___
|
||||
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
||||
|
||||
# Low 32 bits: 0xca62c1d6.  Rounds 60..79 reuse the b^c^d round body.
$code.=<<___;
|
||||
ldah $K,-13725(zero)
|
||||
lda $K,-15914($K) # K_60_79
|
||||
___
|
||||
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
|
||||
# Block wrap-up and function epilogue.  @X[0..4] hold the context words
# fetched during round 79; they are added to A..E and stored back, $inp is
# advanced by 64 bytes, and the loop repeats while $inp is below the end
# address kept in $num.  The saved registers are then restored and the frame
# released.
$code.=<<___;
|
||||
addl @X[0],$A,$A
|
||||
addl @X[1],$B,$B
|
||||
addl @X[2],$C,$C
|
||||
addl @X[3],$D,$D
|
||||
addl @X[4],$E,$E
|
||||
stl $A,0($ctx)
|
||||
stl $B,4($ctx)
|
||||
addq $inp,64,$inp
|
||||
stl $C,8($ctx)
|
||||
stl $D,12($ctx)
|
||||
stl $E,16($ctx)
|
||||
cmpult $inp,$num,$t1
|
||||
bne $t1,.Lloop
|
||||
|
||||
.set noreorder
|
||||
ldq ra,0(sp)
|
||||
ldq s0,8(sp)
|
||||
ldq s1,16(sp)
|
||||
ldq s2,24(sp)
|
||||
ldq s3,32(sp)
|
||||
ldq s4,40(sp)
|
||||
ldq s5,48(sp)
|
||||
ldq fp,56(sp)
|
||||
lda sp,64(sp)
|
||||
ret (ra)
|
||||
.end sha1_block_data_order
|
||||
___
|
||||
# Emit the generated assembly on STDOUT.
print $code;
|
||||
# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so fail loudly instead of leaving a silently truncated
# assembly file behind a zero exit status.
close STDOUT or die "error closing STDOUT: $!";
|
@ -86,8 +86,8 @@ $code.=<<___;
|
||||
ldr $t3,[$Xi,#2*4]
|
||||
add $e,$K,$e,ror#2 @ E+=K_xx_xx
|
||||
eor $t0,$t0,$t1
|
||||
eor $t2,$t2,$t3
|
||||
eor $t0,$t0,$t2
|
||||
eor $t0,$t0,$t3
|
||||
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
|
||||
___
|
||||
$code.=<<___ if (!defined($flag));
|
||||
@ -131,6 +131,15 @@ ___
|
||||
|
||||
sub BODY_40_59 {
|
||||
my ($a,$b,$c,$d,$e)=@_;
|
||||
if (1) {
|
||||
&Xupdate(@_);
|
||||
$code.=<<___;
|
||||
and $t2,$c,$d
|
||||
and $t1,$b,$t1,ror#2
|
||||
add $e,$e,$t2,ror#2
|
||||
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
|
||||
___
|
||||
} else {
|
||||
&Xupdate(@_,1);
|
||||
$code.=<<___;
|
||||
and $t1,$b,$c,ror#2
|
||||
@ -140,6 +149,7 @@ $code.=<<___;
|
||||
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
|
||||
___
|
||||
}
|
||||
}
|
||||
|
||||
$code=<<___;
|
||||
.text
|
||||
|
@ -15,7 +15,7 @@
|
||||
# is >50% better than HP C and >2x better than gcc.
|
||||
|
||||
$code=<<___;
|
||||
.ident \"sha1-ia64.s, version 1.2\"
|
||||
.ident \"sha1-ia64.s, version 1.3\"
|
||||
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
|
||||
.explicit
|
||||
|
||||
@ -26,14 +26,10 @@ if ($^O eq "hpux") {
|
||||
$ADDP="addp4";
|
||||
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
|
||||
} else { $ADDP="add"; }
|
||||
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
|
||||
$big_endian=0 if (/\-DL_ENDIAN/); }
|
||||
if (!defined($big_endian))
|
||||
{ $big_endian=(unpack('L',pack('N',1))==1); }
|
||||
|
||||
#$human=1;
|
||||
if ($human) { # useful for visual code auditing...
|
||||
($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T");
|
||||
($A,$B,$C,$D,$E) = ("A","B","C","D","E");
|
||||
($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
|
||||
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
|
||||
( "K_00_19","K_20_39","K_40_59","K_60_79" );
|
||||
@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing...
|
||||
"X8", "X9","X10","X11","X12","X13","X14","X15" );
|
||||
}
|
||||
else {
|
||||
($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5");
|
||||
($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
|
||||
($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
|
||||
($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
|
||||
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
|
||||
( "r14", "r15", "loc11", "loc12" );
|
||||
( "r14", "r15", "loc10", "loc11" );
|
||||
@X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
|
||||
"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
|
||||
}
|
||||
|
||||
sub BODY_00_15 {
|
||||
local *code=shift;
|
||||
local ($i,$a,$b,$c,$d,$e,$f)=@_;
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
my $Xn=@X[$j%16];
|
||||
|
||||
$code.=<<___ if ($i==0);
|
||||
{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB
|
||||
{ .mmi; ld1 $X[$i]=[inp],2 // MSB
|
||||
ld1 tmp2=[tmp3],2 };;
|
||||
{ .mmi; ld1 tmp0=[inp],2
|
||||
ld1 tmp4=[tmp3],2 // LSB
|
||||
dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };;
|
||||
dep $X[$i]=$X[$i],tmp2,8,8 };;
|
||||
___
|
||||
if ($i<15) {
|
||||
$code.=<<___;
|
||||
{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1
|
||||
{ .mmi; ld1 $Xn=[inp],2 // forward Xload
|
||||
nop.m 0x0
|
||||
dep tmp1=tmp0,tmp4,8,8 };;
|
||||
{ .mmi; ld1 tmp2=[tmp3],2 // +1
|
||||
{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
|
||||
and tmp4=$c,$b
|
||||
dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
|
||||
{ .mmi; andcm tmp1=$d,$b
|
||||
add tmp0=$e,$K_00_19
|
||||
dep $X[$i]=$X[$i],tmp1,16,16} //;;
|
||||
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
|
||||
andcm tmp1=$d,$b
|
||||
dep.z tmp5=$a,5,27 };; // a<<5
|
||||
{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
|
||||
add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
|
||||
{ .mmi; add $e=$e,$X[$i] // e+=Xload
|
||||
or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
|
||||
extr.u tmp1=$a,27,5 };; // a>>27
|
||||
{ .mmi; ld1 tmp0=[inp],2 // +1
|
||||
add $f=$f,tmp4 // f+=F_00_19(b,c,d)
|
||||
{ .mmi; ld1 tmp0=[inp],2 // forward Xload
|
||||
add $e=$e,tmp4 // e+=F_00_19(b,c,d)
|
||||
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
|
||||
{ .mmi; ld1 tmp4=[tmp3],2 // +1
|
||||
{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
|
||||
or tmp5=tmp1,tmp5 // ROTATE(a,5)
|
||||
mux2 tmp6=$a,0x44 };; // see b in next iteration
|
||||
{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5)
|
||||
dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1
|
||||
mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;;
|
||||
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
|
||||
dep $Xn=$Xn,tmp2,8,8 // forward Xload
|
||||
mux2 $X[$i]=$X[$i],0x44 } //;;
|
||||
|
||||
___
|
||||
}
|
||||
@ -89,24 +88,24 @@ else {
|
||||
$code.=<<___;
|
||||
{ .mii; and tmp3=$c,$b
|
||||
dep tmp1=tmp0,tmp4,8,8;;
|
||||
dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
|
||||
{ .mmi; andcm tmp1=$d,$b
|
||||
add tmp0=$e,$K_00_19
|
||||
dep $X[$i]=$X[$i],tmp1,16,16} //;;
|
||||
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
|
||||
andcm tmp1=$d,$b
|
||||
dep.z tmp5=$a,5,27 };; // a<<5
|
||||
{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
|
||||
add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
|
||||
{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
|
||||
or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
|
||||
extr.u tmp1=$a,27,5 } // a>>27
|
||||
{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
|
||||
xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
|
||||
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
|
||||
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
|
||||
nop.i 0 };;
|
||||
{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d)
|
||||
xor tmp2=tmp2,tmp3 // +1
|
||||
{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
|
||||
xor $Xn=$Xn,tmp3 // forward Xupdate
|
||||
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
|
||||
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
|
||||
mux2 tmp6=$a,0x44 };; // see b in next iteration
|
||||
{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
|
||||
shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };;
|
||||
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
|
||||
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
mux2 $X[$i]=$X[$i],0x44 };;
|
||||
|
||||
___
|
||||
}
|
||||
@ -114,27 +113,28 @@ ___
|
||||
|
||||
sub BODY_16_19 {
|
||||
local *code=shift;
|
||||
local ($i,$a,$b,$c,$d,$e,$f)=@_;
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
my $Xn=@X[$j%16];
|
||||
|
||||
$code.=<<___;
|
||||
{ .mmi; mov $X[$i&0xf]=$f // Xupdate
|
||||
and tmp0=$c,$b
|
||||
{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
|
||||
dep.z tmp5=$a,5,27 } // a<<5
|
||||
{ .mmi; andcm tmp1=$d,$b
|
||||
add tmp4=$e,$K_00_19 };;
|
||||
{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
|
||||
add $f=$f,tmp4 // f+=e+K_00_19
|
||||
{ .mib; andcm tmp1=$d,$b
|
||||
and tmp0=$c,$b };;
|
||||
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
|
||||
or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
|
||||
extr.u tmp1=$a,27,5 } // a>>27
|
||||
{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
|
||||
xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
|
||||
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
|
||||
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
|
||||
nop.i 0 };;
|
||||
{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d)
|
||||
xor tmp2=tmp2,tmp3 // +1
|
||||
{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d)
|
||||
xor $Xn=$Xn,tmp3 // forward Xupdate
|
||||
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
|
||||
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
|
||||
mux2 tmp6=$a,0x44 };; // see b in next iteration
|
||||
{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
|
||||
shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
|
||||
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
nop.i 0 };;
|
||||
|
||||
___
|
||||
@ -142,49 +142,47 @@ ___
|
||||
|
||||
sub BODY_20_39 {
|
||||
local *code=shift;
|
||||
local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
|
||||
my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
|
||||
$Konst = $K_20_39 if (!defined($Konst));
|
||||
my $j=$i+1;
|
||||
my $Xn=@X[$j%16];
|
||||
|
||||
if ($i<79) {
|
||||
$code.=<<___;
|
||||
{ .mib; mov $X[$i&0xf]=$f // Xupdate
|
||||
{ .mib; add $e=$e,$Konst // e+=K_XX_XX
|
||||
dep.z tmp5=$a,5,27 } // a<<5
|
||||
{ .mib; xor tmp0=$c,$b
|
||||
add tmp4=$e,$Konst };;
|
||||
{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
|
||||
add $f=$f,tmp4 // f+=e+K_20_39
|
||||
xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
|
||||
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
|
||||
extr.u tmp1=$a,27,5 } // a>>27
|
||||
{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
|
||||
xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
|
||||
nop.i 0 };;
|
||||
{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
|
||||
xor tmp2=tmp2,tmp3 // +1
|
||||
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
|
||||
xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
|
||||
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
|
||||
xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
|
||||
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
|
||||
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
|
||||
mux2 tmp6=$a,0x44 };; // see b in next iteration
|
||||
{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
|
||||
shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
|
||||
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
nop.i 0 };;
|
||||
|
||||
___
|
||||
}
|
||||
else {
|
||||
$code.=<<___;
|
||||
{ .mib; mov $X[$i&0xf]=$f // Xupdate
|
||||
{ .mib; add $e=$e,$Konst // e+=K_60_79
|
||||
dep.z tmp5=$a,5,27 } // a<<5
|
||||
{ .mib; xor tmp0=$c,$b
|
||||
add tmp4=$e,$Konst };;
|
||||
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
|
||||
extr.u tmp1=$a,27,5 } // a>>27
|
||||
{ .mib; add $f=$f,tmp4 // f+=e+K_20_39
|
||||
add $h1=$h1,$a };; // wrap up
|
||||
{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
|
||||
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;?
|
||||
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
|
||||
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
|
||||
extr.u tmp1=$a,27,5 } // a>>27
|
||||
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
|
||||
add $h3=$h3,$c };; // wrap up
|
||||
{ .mib; add tmp3=1,inp // used in unaligned codepath
|
||||
add $f=$f,tmp1 } // f+=ROTATE(a,5)
|
||||
{ .mib; add $h2=$h2,$b // wrap up
|
||||
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
|
||||
or tmp1=tmp1,tmp5 // ROTATE(a,5)
|
||||
shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
|
||||
{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
|
||||
add tmp3=1,inp // used in unaligned codepath
|
||||
add $h4=$h4,$d };; // wrap up
|
||||
|
||||
___
|
||||
@ -193,29 +191,29 @@ ___
|
||||
|
||||
sub BODY_40_59 {
|
||||
local *code=shift;
|
||||
local ($i,$a,$b,$c,$d,$e,$f)=@_;
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
my $Xn=@X[$j%16];
|
||||
|
||||
$code.=<<___;
|
||||
{ .mmi; mov $X[$i&0xf]=$f // Xupdate
|
||||
and tmp0=$c,$b
|
||||
{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
|
||||
dep.z tmp5=$a,5,27 } // a<<5
|
||||
{ .mmi; and tmp1=$d,$b
|
||||
add tmp4=$e,$K_40_59 };;
|
||||
{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d)
|
||||
add $f=$f,tmp4 // f+=e+K_40_59
|
||||
{ .mib; and tmp1=$c,$d
|
||||
xor tmp0=$c,$d };;
|
||||
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
|
||||
add tmp5=tmp5,tmp1 // a<<5+(c&d)
|
||||
extr.u tmp1=$a,27,5 } // a>>27
|
||||
{ .mmi; and tmp4=$c,$d
|
||||
xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
|
||||
xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
|
||||
};;
|
||||
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
|
||||
xor tmp2=tmp2,tmp3 // +1
|
||||
{ .mmi; and tmp0=tmp0,$b
|
||||
xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
|
||||
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
|
||||
{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
|
||||
add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
|
||||
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
|
||||
{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
|
||||
{ .mmi; xor $Xn=$Xn,tmp3
|
||||
mux2 tmp6=$a,0x44 };; // see b in next iteration
|
||||
{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d)
|
||||
shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
add $f=$f,tmp1 };; // f+=ROTATE(a,5)
|
||||
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
|
||||
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
|
||||
nop.i 0x0 };;
|
||||
|
||||
___
|
||||
}
|
||||
@ -237,7 +235,7 @@ inp=r33; // in1
|
||||
.align 32
|
||||
sha1_block_data_order:
|
||||
.prologue
|
||||
{ .mmi; alloc tmp1=ar.pfs,3,15,0,0
|
||||
{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
|
||||
$ADDP tmp0=4,ctx
|
||||
.save ar.lc,r3
|
||||
mov r3=ar.lc }
|
||||
@ -245,8 +243,8 @@ sha1_block_data_order:
|
||||
$ADDP inp=0,inp
|
||||
mov r2=pr };;
|
||||
tmp4=in2;
|
||||
tmp5=loc13;
|
||||
tmp6=loc14;
|
||||
tmp5=loc12;
|
||||
tmp6=loc13;
|
||||
.body
|
||||
{ .mlx; ld4 $h0=[ctx],8
|
||||
movl $K_00_19=0x5a827999 }
|
||||
@ -273,7 +271,7 @@ tmp6=loc14;
|
||||
|
||||
___
|
||||
|
||||
{ my $i,@V=($A,$B,$C,$D,$E,$T);
|
||||
{ my $i,@V=($A,$B,$C,$D,$E);
|
||||
|
||||
for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
|
||||
for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
|
||||
@ -281,12 +279,12 @@ ___
|
||||
for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
|
||||
for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
|
||||
|
||||
(($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
|
||||
(($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
{ .mmb; add $h0=$h0,$E
|
||||
nop.m 0
|
||||
{ .mmb; add $h0=$h0,$A
|
||||
add $h2=$h2,$C
|
||||
br.ctop.dptk.many .Ldtop };;
|
||||
.Ldend:
|
||||
{ .mmi; add tmp0=4,ctx
|
||||
|
281
crypto/sha/asm/sha1-mips.pl
Normal file
281
crypto/sha/asm/sha1-mips.pl
Normal file
@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# SHA1 block procedure for MIPS.
|
||||
|
||||
# Performance improvement is 30% on unaligned input. The "secret" is
|
||||
# to deploy lwl/lwr pair to load unaligned input. One could have
|
||||
# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
|
||||
# compatible subroutine. There is room for minor optimization on
|
||||
# little-endian platforms...
|
||||
#
|
||||
# The code is somewhat IRIX-centric, i.e. is likely to require minor
|
||||
# adaptations for other OSes...
|
||||
|
||||
# Target endianness: taken from -DB_ENDIAN / -DL_ENDIAN on the command line
# (the last matching flag wins); otherwise the byte order of the machine
# running this script is used (pack 'N' vs. native 'L').
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
|
||||
$big_endian=0 if (/\-DL_ENDIAN/); }
|
||||
if (!defined($big_endian))
|
||||
{ $big_endian=(unpack('L',pack('N',1))==1); }
|
||||
|
||||
# offsets of the Most and Least Significant Bytes
|
||||
$MSB=$big_endian?0:3;
|
||||
$LSB=3&~$MSB;
|
||||
|
||||
# Register allocation for the generated MIPS code.
# @X: the 16-word message schedule window, in registers $8-$23.
@X=( "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15",
|
||||
"\$16", "\$17", "\$18", "\$19", "\$20", "\$21", "\$22", "\$23");
|
||||
# Arguments of sha1_block_data_order.
$ctx="\$4"; # a0
|
||||
$inp="\$5"; # a1
|
||||
$num="\$6"; # a2
|
||||
# SHA-1 working variables A..E; @V is rotated by the round-generation loops.
$A="\$1";
|
||||
$B="\$2";
|
||||
$C="\$3";
|
||||
$D="\$7";
|
||||
$E="\$24"; @V=($A,$B,$C,$D,$E);
|
||||
# Scratch registers and the round constant.  $28, $30 and $31 are among the
# registers the prologue saves and the epilogue restores.
$t0="\$25"; # jp,t9
|
||||
$t1="\$28"; # gp
|
||||
$t2="\$30"; # fp,s8
|
||||
$K="\$31"; # ra
|
||||
|
||||
# Stack frame size in SZREG units; the prologue stores 11 registers in the
# topmost slots.
$FRAMESIZE=16;
|
||||
|
||||
# Emit MIPS code for one SHA-1 round in the range 0..14.
#   $i        round number
#   $a..$e    names of the registers currently holding the working variables
# Appends to the global $code; relies on the globals @X, $K, $inp, $MSB,
# $LSB, $big_endian and $t0..$t2.
sub BODY_00_14 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
# Little-endian targets only: byte-swap the freshly loaded word X[i] in place.
$code.=<<___ if (!$big_endian);
|
||||
srl $t0,@X[$i],24 # byte swap($i)
|
||||
srl $t1,@X[$i],8
|
||||
andi $t2,@X[$i],0xFF00
|
||||
sll @X[$i],@X[$i],24
|
||||
andi $t1,0xFF00
|
||||
sll $t2,$t2,8
|
||||
or @X[$i],$t0
|
||||
or @X[$i],$t1
|
||||
or @X[$i],$t2
|
||||
___
|
||||
# Round computation, interleaved with the lwl/lwr pair that fetches the next
# input word X[j] from a possibly unaligned address.  F is computed as
# ((c ^ d) & b) ^ d, and b is rotated left by 30 via sll 30 / srl 2 / or.
$code.=<<___;
|
||||
lwl @X[$j],$j*4+$MSB($inp)
|
||||
sll $t0,$a,5 # $i
|
||||
addu $e,$K
|
||||
lwr @X[$j],$j*4+$LSB($inp)
|
||||
srl $t1,$a,27
|
||||
addu $e,$t0
|
||||
xor $t0,$c,$d
|
||||
addu $e,$t1
|
||||
sll $t2,$b,30
|
||||
and $t0,$b
|
||||
srl $b,$b,2
|
||||
xor $t0,$d
|
||||
addu $e,@X[$i]
|
||||
or $b,$t2
|
||||
addu $e,$t0
|
||||
___
|
||||
}
|
||||
|
||||
# Emit MIPS code for one SHA-1 round in the range 15..19: same F as rounds
# 0..14, but the next word X[j] comes from the schedule update instead of a
# load from the input.
#   $i        round number
#   $a..$e    names of the registers currently holding the working variables
# Appends to the global $code; relies on the globals @X, $K, $big_endian and
# $t0..$t2.
sub BODY_15_19 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
|
||||
# Word 15 is the last one loaded from the input (by round 14), so on
# little-endian targets it still needs its byte swap here.
$code.=<<___ if (!$big_endian && $i==15);
|
||||
srl $t0,@X[$i],24 # byte swap($i)
|
||||
srl $t1,@X[$i],8
|
||||
andi $t2,@X[$i],0xFF00
|
||||
sll @X[$i],@X[$i],24
|
||||
andi $t1,0xFF00
|
||||
sll $t2,$t2,8
|
||||
or @X[$i],$t0
|
||||
or @X[$i],$t1
|
||||
or @X[$i],$t2
|
||||
___
|
||||
# Round computation interleaved with the schedule update for word $j:
# X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13] (indices mod 16), then rotate left by one
# (srl 31 / addu x,x / or).
$code.=<<___;
|
||||
xor @X[$j%16],@X[($j+2)%16]
|
||||
sll $t0,$a,5 # $i
|
||||
addu $e,$K
|
||||
srl $t1,$a,27
|
||||
addu $e,$t0
|
||||
xor @X[$j%16],@X[($j+8)%16]
|
||||
xor $t0,$c,$d
|
||||
addu $e,$t1
|
||||
xor @X[$j%16],@X[($j+13)%16]
|
||||
sll $t2,$b,30
|
||||
and $t0,$b
|
||||
srl $t1,@X[$j%16],31
|
||||
addu @X[$j%16],@X[$j%16]
|
||||
srl $b,$b,2
|
||||
xor $t0,$d
|
||||
or @X[$j%16],$t1
|
||||
addu $e,@X[$i%16]
|
||||
or $b,$t2
|
||||
addu $e,$t0
|
||||
___
|
||||
}
|
||||
|
||||
# Emit MIPS code for one SHA-1 round with F(b,c,d) = b ^ c ^ d.  The driver
# calls this for rounds 20..39 and again for rounds 60..79.
#   $i        round number
#   $a..$e    names of the registers currently holding the working variables
# Appends to the global $code; relies on the globals @X, $K, $ctx, $t0..$t2.
sub BODY_20_39 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
# Every round except the last: round computation interleaved with the
# schedule update for word $j=$i+1.
$code.=<<___ if ($i<79);
|
||||
xor @X[$j%16],@X[($j+2)%16]
|
||||
sll $t0,$a,5 # $i
|
||||
addu $e,$K
|
||||
srl $t1,$a,27
|
||||
addu $e,$t0
|
||||
xor @X[$j%16],@X[($j+8)%16]
|
||||
xor $t0,$c,$d
|
||||
addu $e,$t1
|
||||
xor @X[$j%16],@X[($j+13)%16]
|
||||
sll $t2,$b,30
|
||||
xor $t0,$b
|
||||
srl $t1,@X[$j%16],31
|
||||
addu @X[$j%16],@X[$j%16]
|
||||
srl $b,$b,2
|
||||
addu $e,@X[$i%16]
|
||||
or @X[$j%16],$t1
|
||||
or $b,$t2
|
||||
addu $e,$t0
|
||||
___
|
||||
# Last round: no schedule update; the five context words are loaded into
# @X[0..4] so the code emitted after the rounds can add them to A..E.
$code.=<<___ if ($i==79);
|
||||
lw @X[0],0($ctx)
|
||||
sll $t0,$a,5 # $i
|
||||
addu $e,$K
|
||||
lw @X[1],4($ctx)
|
||||
srl $t1,$a,27
|
||||
addu $e,$t0
|
||||
lw @X[2],8($ctx)
|
||||
xor $t0,$c,$d
|
||||
addu $e,$t1
|
||||
lw @X[3],12($ctx)
|
||||
sll $t2,$b,30
|
||||
xor $t0,$b
|
||||
lw @X[4],16($ctx)
|
||||
srl $b,$b,2
|
||||
addu $e,@X[$i%16]
|
||||
or $b,$t2
|
||||
addu $e,$t0
|
||||
___
|
||||
}
|
||||
|
||||
# Emit MIPS code for one SHA-1 round in the range 40..59.  The majority
# function is added to e in two parts, (c & d) and ((c ^ d) & b), which never
# have a bit set in the same position, so their sum equals their OR.
#   $i        round number
#   $a..$e    names of the registers currently holding the working variables
# Appends to the global $code; relies on the globals @X, $K, $t0..$t2.
sub BODY_40_59 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
# The $i<79 guard always holds for the rounds the driver passes here (40..59).
$code.=<<___ if ($i<79);
|
||||
xor @X[$j%16],@X[($j+2)%16]
|
||||
sll $t0,$a,5 # $i
|
||||
addu $e,$K
|
||||
srl $t1,$a,27
|
||||
addu $e,$t0
|
||||
xor @X[$j%16],@X[($j+8)%16]
|
||||
and $t0,$c,$d
|
||||
addu $e,$t1
|
||||
xor @X[$j%16],@X[($j+13)%16]
|
||||
sll $t2,$b,30
|
||||
addu $e,$t0
|
||||
srl $t1,@X[$j%16],31
|
||||
xor $t0,$c,$d
|
||||
addu @X[$j%16],@X[$j%16]
|
||||
and $t0,$b
|
||||
srl $b,$b,2
|
||||
or @X[$j%16],$t1
|
||||
addu $e,@X[$i%16]
|
||||
or $b,$t2
|
||||
addu $e,$t0
|
||||
___
|
||||
}
|
||||
|
||||
# Start of the generated file: function prologue and loop head.  The emitted
# code allocates $FRAMESIZE*SZREG bytes of stack, saves $31, $30, $28 and
# $16-$23, loads A..E from the context (E in the delay slot of the branch to
# .Loop), and at .Loop fetches the first input word with lwl/lwr while
# building K_00_19 = 0x5a827999 with lui/ori.
$code=<<___;
|
||||
#include <asm.h>
|
||||
#include <regdef.h>
|
||||
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set noreorder
|
||||
.align 5
|
||||
.globl sha1_block_data_order
|
||||
.ent sha1_block_data_order
|
||||
sha1_block_data_order:
|
||||
.frame sp,$FRAMESIZE*SZREG,zero
|
||||
.mask 0xd0ff0000,-$FRAMESIZE*SZREG
|
||||
.set noreorder
|
||||
PTR_SUB sp,$FRAMESIZE*SZREG
|
||||
REG_S \$31,($FRAMESIZE-1)*SZREG(sp)
|
||||
REG_S \$30,($FRAMESIZE-2)*SZREG(sp)
|
||||
REG_S \$28,($FRAMESIZE-3)*SZREG(sp)
|
||||
REG_S \$23,($FRAMESIZE-4)*SZREG(sp)
|
||||
REG_S \$22,($FRAMESIZE-5)*SZREG(sp)
|
||||
REG_S \$21,($FRAMESIZE-6)*SZREG(sp)
|
||||
REG_S \$20,($FRAMESIZE-7)*SZREG(sp)
|
||||
REG_S \$19,($FRAMESIZE-8)*SZREG(sp)
|
||||
REG_S \$18,($FRAMESIZE-9)*SZREG(sp)
|
||||
REG_S \$17,($FRAMESIZE-10)*SZREG(sp)
|
||||
REG_S \$16,($FRAMESIZE-11)*SZREG(sp)
|
||||
|
||||
lw $A,0($ctx)
|
||||
lw $B,4($ctx)
|
||||
lw $C,8($ctx)
|
||||
lw $D,12($ctx)
|
||||
b .Loop
|
||||
lw $E,16($ctx)
|
||||
.align 4
|
||||
.Loop:
|
||||
.set reorder
|
||||
lwl @X[0],$MSB($inp)
|
||||
lui $K,0x5a82
|
||||
lwr @X[0],$LSB($inp)
|
||||
ori $K,0x7999 # K_00_19
|
||||
___
|
||||
# Unroll all 80 rounds.  After each round @V is rotated right by one: the
# register that just accumulated the round result (the old E) becomes the new
# A for the following round.  $i deliberately carries over between the loops,
# and the round constant in $K is reloaded at each 20-round boundary.
for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
|
||||
for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
lui $K,0x6ed9
|
||||
ori $K,0xeba1 # K_20_39
|
||||
___
|
||||
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
lui $K,0x8f1b
|
||||
ori $K,0xbcdc # K_40_59
|
||||
___
|
||||
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
lui $K,0xca62
|
||||
ori $K,0xc1d6 # K_60_79
|
||||
___
|
||||
# Rounds 60..79 reuse the b^c^d round body with the K_60_79 constant.
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
# Block wrap-up and function epilogue.  X[0..4] hold the context words
# fetched during round 79; they are added to A..E and stored back, $num is
# decremented, and the loop repeats while it is non-zero, with the 64-byte
# advance of $inp sitting in the branch delay slot.  The saved registers are
# then restored and the frame is released in the delay slot of the return.
$code.=<<___;
|
||||
addu $A,$X[0]
|
||||
addu $B,$X[1]
|
||||
sw $A,0($ctx)
|
||||
addu $C,$X[2]
|
||||
addu $D,$X[3]
|
||||
sw $B,4($ctx)
|
||||
addu $E,$X[4]
|
||||
PTR_SUB $num,1
|
||||
sw $C,8($ctx)
|
||||
sw $D,12($ctx)
|
||||
sw $E,16($ctx)
|
||||
.set noreorder
|
||||
bnez $num,.Loop
|
||||
PTR_ADD $inp,64
|
||||
|
||||
.set noreorder
|
||||
REG_L \$31,($FRAMESIZE-1)*SZREG(sp)
|
||||
REG_L \$30,($FRAMESIZE-2)*SZREG(sp)
|
||||
REG_L \$28,($FRAMESIZE-3)*SZREG(sp)
|
||||
REG_L \$23,($FRAMESIZE-4)*SZREG(sp)
|
||||
REG_L \$22,($FRAMESIZE-5)*SZREG(sp)
|
||||
REG_L \$21,($FRAMESIZE-6)*SZREG(sp)
|
||||
REG_L \$20,($FRAMESIZE-7)*SZREG(sp)
|
||||
REG_L \$19,($FRAMESIZE-8)*SZREG(sp)
|
||||
REG_L \$18,($FRAMESIZE-9)*SZREG(sp)
|
||||
REG_L \$17,($FRAMESIZE-10)*SZREG(sp)
|
||||
REG_L \$16,($FRAMESIZE-11)*SZREG(sp)
|
||||
jr ra
|
||||
PTR_ADD sp,$FRAMESIZE*SZREG
|
||||
.end sha1_block_data_order
|
||||
___
|
||||
# Emit the generated assembly on STDOUT.
print $code;
|
||||
# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so fail loudly instead of leaving a silently truncated
# assembly file behind a zero exit status.
close STDOUT or die "error closing STDOUT: $!";
|
259
crypto/sha/asm/sha1-parisc.pl
Normal file
259
crypto/sha/asm/sha1-parisc.pl
Normal file
@ -0,0 +1,259 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# SHA1 block procedure for PA-RISC.
|
||||
|
||||
# June 2009.
|
||||
#
|
||||
# On PA-7100LC performance is >30% better than gcc 3.2 generated code
|
||||
# for aligned input and >50% better for unaligned. Compared to vendor
|
||||
# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
|
||||
# a few percent faster in 32-bit one (this is for aligned input; data for
|
||||
# unaligned input is not available).
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account.
|
||||
|
||||
# Command line: <flavour> <output file>.
$flavour = shift;
|
||||
$output = shift;
|
||||
# Three-argument open keeps the file name from being parsed as part of the
# mode, and a failed open is reported instead of silently discarding the
# generated code.  Without an output argument the code goes to the inherited
# STDOUT.
if (defined($output) && $output ne "") {
    open STDOUT, ">", $output or die "can't open $output: $!";
}
|
||||
|
||||
# Per-flavour parameters: a flavour string containing "64" selects the 2.0W
# level with 8-byte stack slots and the std/ldd store/load mnemonics; anything
# else selects the 1.0 level with 4-byte slots and stw/ldw.
if ($flavour =~ /64/) {
|
||||
$LEVEL ="2.0W";
|
||||
$SIZE_T =8;
|
||||
$FRAME_MARKER =80;
|
||||
$SAVED_RP =16;
|
||||
$PUSH ="std";
|
||||
$PUSHMA ="std,ma";
|
||||
$POP ="ldd";
|
||||
$POPMB ="ldd,mb";
|
||||
} else {
|
||||
$LEVEL ="1.0";
|
||||
$SIZE_T =4;
|
||||
$FRAME_MARKER =48;
|
||||
$SAVED_RP =20;
|
||||
$PUSH ="stw";
|
||||
$PUSHMA ="stwm";
|
||||
$POP ="ldw";
|
||||
$POPMB ="ldwm";
|
||||
}
|
||||
|
||||
$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
|
||||
# [+ argument transfer]
|
||||
# Argument registers of the generated function.
$ctx="%r26"; # arg0
|
||||
$inp="%r25"; # arg1
|
||||
$num="%r24"; # arg2
|
||||
|
||||
# Scratch registers and the register holding the round constant.
$t0="%r28";
|
||||
$t1="%r29";
|
||||
$K="%r31";
|
||||
|
||||
# Message schedule registers.  The round bodies index @X modulo 16; the 17th
# entry aliases $t0 (its use is not visible in this part of the file).
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
|
||||
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
|
||||
|
||||
# SHA-1 working variables A..E; @V is the list handed to the round bodies.
@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
|
||||
|
||||
# Emit one SHA-1 round of the 0..19 group and append it to the global $code.
#
#   $i          - round number (printed as a comment on the first line)
#   $a .. $e    - names of the registers holding the working variables
#                 in this round's rotation
#
# Reads the globals @X (message-word registers), $K (round constant
# register) and $t0/$t1 (scratch).  The round function emitted is
# (c & b) | (d & ~b); "shd r,r,27" and "shd r,r,2" rotate a register
# right by 27 and 2 bits respectively.
#
# Every instruction line is indented: an assembler label starts in
# column 0, so a mnemonic must not.
sub BODY_00_19 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    if ($i<15) {
        # Plain round: the message word is used as loaded.
        $code.=<<___;
	addl	$K,$e,$e	; $i
	shd	$a,$a,27,$t1
	addl	$X[$i],$e,$e
	and	$c,$b,$t0
	addl	$t1,$e,$e
	andcm	$d,$b,$t1
	shd	$b,$b,2,$b
	or	$t1,$t0,$t0
	addl	$t0,$e,$e
___
    }
    else {
        # Round with forward Xupdate: while consuming X[i], compute the
        # word the next round needs, X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13]
        # (indices mod 16), then rotate it (shd ...,31).
        $code.=<<___;
	addl	$K,$e,$e	; $i
	shd	$a,$a,27,$t1
	xor	$X[($j+2)%16],$X[$j%16],$X[$j%16]
	addl	$X[$i%16],$e,$e
	and	$c,$b,$t0
	xor	$X[($j+8)%16],$X[$j%16],$X[$j%16]
	addl	$t1,$e,$e
	andcm	$d,$b,$t1
	shd	$b,$b,2,$b
	or	$t1,$t0,$t0
	xor	$X[($j+13)%16],$X[$j%16],$X[$j%16]
	add	$t0,$e,$e
	shd	$X[$j%16],$X[$j%16],31,$X[$j%16]
___
    }
}
|
||||
|
||||
# Emit one SHA-1 round that uses the b ^ c ^ d round function and append
# it to the global $code.  The main script calls this for rounds 20..39
# and again for rounds 60..79.
#
#   $i          - round number (printed as a comment on the first line)
#   $a .. $e    - names of the registers holding the working variables
#                 in this round's rotation
#
# Reads the globals @X, $K, $t0, $t1 and, for the last round, $ctx.
#
# Every instruction line is indented: an assembler label starts in
# column 0, so a mnemonic must not.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    if ($i<79) {
        # Round interleaved with the Xupdate for the next round:
        # X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13] (indices mod 16), then rotate.
        $code.=<<___;
	xor	$X[($j+2)%16],$X[$j%16],$X[$j%16]	; $i
	addl	$K,$e,$e
	shd	$a,$a,27,$t1
	xor	$X[($j+8)%16],$X[$j%16],$X[$j%16]
	addl	$X[$i%16],$e,$e
	xor	$b,$c,$t0
	xor	$X[($j+13)%16],$X[$j%16],$X[$j%16]
	addl	$t1,$e,$e
	shd	$b,$b,2,$b
	xor	$d,$t0,$t0
	shd	$X[$j%16],$X[$j%16],31,$X[$j%16]
	addl	$t0,$e,$e
___
    }
    if ($i==79) {
        # Final round: no further message word is needed, so the slots
        # are used to fetch the five context words into X[0..4] for the
        # additions that follow the round sequence.
        $code.=<<___;
	ldw	0($ctx),$X[0]	; $i
	addl	$K,$e,$e
	shd	$a,$a,27,$t1
	ldw	4($ctx),$X[1]
	addl	$X[$i%16],$e,$e
	xor	$b,$c,$t0
	ldw	8($ctx),$X[2]
	addl	$t1,$e,$e
	shd	$b,$b,2,$b
	xor	$d,$t0,$t0
	ldw	12($ctx),$X[3]
	addl	$t0,$e,$e
	ldw	16($ctx),$X[4]
___
    }
}
|
||||
|
||||
# Emit one SHA-1 round of the 40..59 group and append it to the global
# $code.
#
#   $i          - round number (printed as a comment on the first line)
#   $a .. $e    - names of the registers holding the working variables
#                 in this round's rotation
#
# Reads the globals @X, $K, $t0, $t1.  The round function is added to $e
# in two parts, b & (c ^ d) and c & d, and the Xupdate for the next round
# (X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13], indices mod 16, then rotate) is
# interleaved with it.
#
# Every instruction line is indented: an assembler label starts in
# column 0, so a mnemonic must not.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;

    $code.=<<___;
	shd	$a,$a,27,$t1	; $i
	addl	$K,$e,$e
	xor	$X[($j+2)%16],$X[$j%16],$X[$j%16]
	xor	$d,$c,$t0
	addl	$X[$i%16],$e,$e
	xor	$X[($j+8)%16],$X[$j%16],$X[$j%16]
	and	$b,$t0,$t0
	addl	$t1,$e,$e
	shd	$b,$b,2,$b
	xor	$X[($j+13)%16],$X[$j%16],$X[$j%16]
	addl	$t0,$e,$e
	and	$d,$c,$t1
	shd	$X[$j%16],$X[$j%16],31,$X[$j%16]
	addl	$t1,$e,$e
___
}
|
||||
|
||||
# Start of the generated module: assembler directives, the entry point
# sha1_block_data_order, the prologue saving %r2 and %r3-%r16, the load
# of the five context words into $A..$E, and the head of the per-block
# loop.
#
# Before the loop %sar (%cr11) is set to 32-8*(inp&3); the unaligned
# path uses it as the shift amount for vshd.  At the top of each
# iteration $t0 receives $inp with its two low bits cleared.
#
# Labels (sha1_block_data_order, L$oop) start in column 0; directives
# and instructions are indented so the assembler does not take them
# for labels.  Expressions in backticks are evaluated when the code is
# post-processed.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
sha1_block_data_order
	.PROC
	.CALLINFO	FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)

	ldw	0($ctx),$A
	ldw	4($ctx),$B
	ldw	8($ctx),$C
	ldw	12($ctx),$D
	ldw	16($ctx),$E

	extru	$inp,31,2,$t0		; t0=inp&3;
	sh3addl	$t0,%r0,$t0		; t0*=8;
	subi	32,$t0,$t0		; t0=32-t0;
	mtctl	$t0,%cr11		; %sar=t0;

L\$oop
	ldi	3,$t0
	andcm	$inp,$t0,$t0		; 64-bit neutral
___
|
||||
# Load one 64-byte input block into X[0..15] from the word-aligned
# address in $t0.
for my $word (0 .. 14) {		# load input block
    $code.="\tldw\t`4*$word`($t0),$X[$word]\n";
}

# If $inp equals its aligned copy the block is usable as loaded and the
# realignment below is skipped.  The sixteenth word is loaded by the
# instruction following the branch; the seventeenth goes into X[16],
# which is $t0 itself, so it has to be the last load through $t0.
$code.=<<___;
	cmpb,*=	$inp,$t0,L\$aligned
	ldw	60($t0),$X[15]
	ldw	64($t0),$X[16]
___

# Unaligned input: shift every adjacent pair of words by %sar so that
# X[0..15] end up holding the sixteen words starting at $inp.
for my $word (0 .. 15) {		# align input
    $code.="\tvshd\t$X[$word],$X[$word+1],$X[$word]\n";
}

# The label starts in column 0; instructions are indented so the
# assembler does not take the mnemonic for a label.
$code.=<<___;
L\$aligned
	ldil	L'0x5a827000,$K		; K_00_19
	ldo	0x999($K),$K
___
|
||||
# Emit the 80 rounds.  After each round the list of working-variable
# registers is rotated by one position, so the generated code renames
# registers instead of moving values between them.  Before each
# 20-round group after the first, $K is reloaded with that group's
# constant as an ldil/ldo pair (the first constant is already loaded
# at this point).
#
# Instruction lines are indented so the assembler does not take the
# mnemonic for a label.
for ($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;
	ldil	L'0x6ed9e000,$K		; K_20_39
	ldo	0xba1($K),$K
___

for (;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;
	ldil	L'0x8f1bb000,$K		; K_40_59
	ldo	0xcdc($K),$K
___

for (;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;
	ldil	L'0xca62c000,$K		; K_60_79
	ldo	0x1d6($K),$K
___

# Rounds 60..79 use the same round function as rounds 20..39.
for (;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
|
||||
# Tail of the per-block loop and the function epilogue.
#
# X[0..4] hold the context words fetched during round 79; they are
# added to $A..$E and the sums stored back to the context.  addib
# decrements $num and branches back to L$oop while it is non-zero; the
# instruction after it advances $inp by one 64-byte block.  The epilogue
# reloads %r2 and %r4-%r16; %r3 is reloaded, and the frame released, by
# the instruction that follows the return branch.
#
# Instruction and directive lines are indented so the assembler does
# not take them for labels.  Expressions in backticks are evaluated when
# the code is post-processed.
$code.=<<___;
	addl	$X[0],$A,$A
	addl	$X[1],$B,$B
	addl	$X[2],$C,$C
	addl	$X[3],$D,$D
	addl	$X[4],$E,$E
	stw	$A,0($ctx)
	stw	$B,4($ctx)
	stw	$C,8($ctx)
	stw	$D,12($ctx)
	stw	$E,16($ctx)
	addib,*<> -1,$num,L\$oop
	ldo	64($inp),$inp

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
|
||||
|
||||
# Turn the assembled template into final assembler text.
#
#   $asm    - generated code, possibly containing `expr` placeholders
#   $size_t - register slot size in bytes: 4 (32-bit) or 8 (64-bit build)
#
# Returns the text with
#   * every `expr` replaced by the value of evaluating expr as Perl,
#   * the ",*" in completers such as "cmpb,*=" reduced to "," for the
#     32-bit build,
#   * "bv" replaced by "bve" for the 64-bit build, so the return branch
#     is the external form (NOTE(review): per the PA-RISC 2.0 wide-mode
#     calling convention - confirm against the target toolchain).
sub _finalize_code {
    my ($asm, $size_t) = @_;

    $asm =~ s/\`([^\`]*)\`/eval $1/gem;
    $asm =~ s/,\*/,/gm		if ($size_t==4);
    $asm =~ s/\bbv\b/bve/gm	if ($size_t==8);

    return $asm;
}

$code = _finalize_code($code, $SIZE_T);
print $code;

# Buffered write errors only surface at close; do not ignore them.
close STDOUT or die "error closing STDOUT: $!";
|
Loading…
x
Reference in New Issue
Block a user