RIPEMD160 shape-up Intel assembler companion. Cycle counter benchmarks
went down from 1050 to 921 cycles on Pentium II. I haven't checked the
figures on Pentium yet.
This commit is contained in:
Andy Polyakov 1999-08-28 13:07:51 +00:00
parent 28e0be13f6
commit 2d0c55eda2
3 changed files with 1955 additions and 1942 deletions

View File

@ -34,6 +34,8 @@ void GetTSC(unsigned long& tsc)
#include <stdlib.h> #include <stdlib.h>
#include <openssl/ripemd.h> #include <openssl/ripemd.h>
#define ripemd160_block_x86 ripemd160_block_asm_host_order
extern "C" { extern "C" {
void ripemd160_block_x86(RIPEMD160_CTX *ctx, unsigned char *buffer,int num); void ripemd160_block_x86(RIPEMD160_CTX *ctx, unsigned char *buffer,int num);
} }
@ -55,8 +57,10 @@ void main(int argc,char *argv[])
if (num == 0) num=16; if (num == 0) num=16;
if (num > 250) num=16; if (num > 250) num=16;
numm=num+2; numm=num+2;
#if 0
num*=64; num*=64;
numm*=64; numm*=64;
#endif
for (j=0; j<6; j++) for (j=0; j<6; j++)
{ {
@ -71,7 +75,7 @@ void main(int argc,char *argv[])
GetTSC(e2); GetTSC(e2);
ripemd160_block_x86(&ctx,buffer,num); ripemd160_block_x86(&ctx,buffer,num);
} }
printf("ripemd160 (%d bytes) %d %d (%.2f)\n",num, printf("ripemd160 (%d bytes) %d %d (%.2f)\n",num*64,
e1-s1,e2-s2,(double)((e1-s1)-(e2-s2))/2); e1-s1,e2-s2,(double)((e1-s1)-(e2-s2))/2);
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,7 @@
#!/usr/local/bin/perl #!/usr/local/bin/perl
# Normal is the # Normal is the
# ripemd160_block_x86(MD5_CTX *c, ULONG *X); # ripemd160_block_asm_host_order(RIPEMD160_CTX *c, ULONG *X,int blocks);
# version, non-normal is the
# ripemd160_block_x86(MD5_CTX *c, ULONG *X,int blocks);
$normal=0; $normal=0;
@ -12,13 +10,13 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0); &asm_init($ARGV[0],$0);
$A="eax"; $A="ecx";
$B="ebx"; $B="esi";
$C="ecx"; $C="edi";
$D="edx"; $D="ebx";
$E="ebp"; $E="ebp";
$tmp1="esi"; $tmp1="eax";
$tmp2="edi"; $tmp2="edx";
$KL1=0x5A827999; $KL1=0x5A827999;
$KL2=0x6ED9EBA1; $KL2=0x6ED9EBA1;
@ -58,13 +56,13 @@ $KR3=0x7A6D76E9;
8, 5,12, 9,12, 5,14, 6, 8,13, 6, 5,15,13,11,11, 8, 5,12, 9,12, 5,14, 6, 8,13, 6, 5,15,13,11,11,
); );
&ripemd160_block("ripemd160_block_x86"); &ripemd160_block("ripemd160_block_asm_host_order");
&asm_finish(); &asm_finish();
sub Xv sub Xv
{ {
local($n)=@_; local($n)=@_;
return(&swtmp($n+1)); return(&swtmp($n));
# tmp on stack # tmp on stack
} }
@ -82,7 +80,7 @@ sub RIP1
&comment($p++); &comment($p++);
if ($p & 1) if ($p & 1)
{ {
&mov($tmp1, $c) if $o == -1; #&mov($tmp1, $c) if $o == -1;
&xor($tmp1, $d) if $o == -1; &xor($tmp1, $d) if $o == -1;
&mov($tmp2, &Xv($pos)); &mov($tmp2, &Xv($pos));
&xor($tmp1, $b); &xor($tmp1, $b);
@ -290,7 +288,7 @@ sub RIP5
&rotl($c, 10); &rotl($c, 10);
&lea($a, &DWP($K,$a,$tmp1,1)); &lea($a, &DWP($K,$a,$tmp1,1));
&sub($tmp2, &Np($d)) if $o <= 0; &sub($tmp2, &Np($d)) if $o <= 0;
&mov(&swtmp(1+16), $A) if $o == 1; &mov(&swtmp(16), $A) if $o == 1;
&mov($tmp1, &Np($d)) if $o == 2; &mov($tmp1, &Np($d)) if $o == 2;
&rotl($a, $s); &rotl($a, $s);
&add($a, $e); &add($a, $e);
@ -310,19 +308,25 @@ sub ripemd160_block
# D 12 # D 12
# E 16 # E 16
&mov($tmp2, &wparam(0));
&mov($tmp1, &wparam(1));
&push("esi"); &push("esi");
&mov($C, &wparam(2)); &mov($A, &DWP( 0,$tmp2,"",0));
&push("edi"); &push("edi");
&mov($tmp1, &wparam(1)); # edi &mov($B, &DWP( 4,$tmp2,"",0));
&push("ebp"); &push("ebp");
&add($C, $tmp1); # offset we end at &mov($C, &DWP( 8,$tmp2,"",0));
&push("ebx"); &push("ebx");
&sub($C, 64); &stack_push(16+5+6);
&stack_push(16+5+1); # Special comment about the figure of 6.
# XXX # Idea is to pad the current frame so
# that the top of the stack gets fairly
&mov(&swtmp(0), $C); # aligned. Well, as you realize it would
&mov($tmp2, &wparam(0)); # Done at end of loop # always depend on how the frame below is
# aligned. The good news is that gcc-2.95
# and later does keep the first argument at
# least double-wise aligned.
# <appro@fy.chalmers.se>
&set_label("start") unless $normal; &set_label("start") unless $normal;
&comment(""); &comment("");
@ -332,16 +336,12 @@ sub ripemd160_block
for ($z=0; $z<16; $z+=2) for ($z=0; $z<16; $z+=2)
{ {
&mov($A, &DWP( $z*4,$tmp1,"",0)); &mov($D, &DWP( $z*4,$tmp1,"",0));
&mov($B, &DWP( ($z+1)*4,$tmp1,"",0)); &mov($E, &DWP( ($z+1)*4,$tmp1,"",0));
&mov(&swtmp(1+$z), $A); &mov(&swtmp($z), $D);
&mov(&swtmp(1+$z+1), $B); &mov(&swtmp($z+1), $E);
} }
&add($tmp1, 64); &mov($tmp1, $C);
&mov($A, &DWP( 0,$tmp2,"",0));
&mov(&wparam(1),$tmp1);
&mov($B, &DWP( 4,$tmp2,"",0));
&mov($C, &DWP( 8,$tmp2,"",0));
&mov($D, &DWP(12,$tmp2,"",0)); &mov($D, &DWP(12,$tmp2,"",0));
&mov($E, &DWP(16,$tmp2,"",0)); &mov($E, &DWP(16,$tmp2,"",0));
@ -431,14 +431,14 @@ sub ripemd160_block
&RIP5($B,$C,$D,$E,$A,$wl[79],$sl[79],$KL4,1); &RIP5($B,$C,$D,$E,$A,$wl[79],$sl[79],$KL4,1);
# &mov($tmp2, &wparam(0)); # moved into last RIP5 # &mov($tmp2, &wparam(0)); # moved into last RIP5
# &mov(&swtmp(1+16), $A); # &mov(&swtmp(16), $A);
&mov($A, &DWP( 0,$tmp2,"",0)); &mov($A, &DWP( 0,$tmp2,"",0));
&mov(&swtmp(1+17), $B); &mov(&swtmp(16+1), $B);
&mov(&swtmp(1+18), $C); &mov(&swtmp(16+2), $C);
&mov($B, &DWP( 4,$tmp2,"",0)); &mov($B, &DWP( 4,$tmp2,"",0));
&mov(&swtmp(1+19), $D); &mov(&swtmp(16+3), $D);
&mov($C, &DWP( 8,$tmp2,"",0)); &mov($C, &DWP( 8,$tmp2,"",0));
&mov(&swtmp(1+20), $E); &mov(&swtmp(16+4), $E);
&mov($D, &DWP(12,$tmp2,"",0)); &mov($D, &DWP(12,$tmp2,"",0));
&mov($E, &DWP(16,$tmp2,"",0)); &mov($E, &DWP(16,$tmp2,"",0));
@ -531,46 +531,54 @@ sub ripemd160_block
&mov($tmp1, &DWP( 4,$tmp2,"",0)); # ctx->B &mov($tmp1, &DWP( 4,$tmp2,"",0)); # ctx->B
&add($D, $tmp1); &add($D, $tmp1);
&mov($tmp1, &swtmp(1+18)); # $c &mov($tmp1, &swtmp(16+2)); # $c
&add($D, $tmp1); &add($D, $tmp1);
&mov($tmp1, &DWP( 8,$tmp2,"",0)); # ctx->C &mov($tmp1, &DWP( 8,$tmp2,"",0)); # ctx->C
&add($E, $tmp1); &add($E, $tmp1);
&mov($tmp1, &swtmp(1+19)); # $d &mov($tmp1, &swtmp(16+3)); # $d
&add($E, $tmp1); &add($E, $tmp1);
&mov($tmp1, &DWP(12,$tmp2,"",0)); # ctx->D &mov($tmp1, &DWP(12,$tmp2,"",0)); # ctx->D
&add($A, $tmp1); &add($A, $tmp1);
&mov($tmp1, &swtmp(1+20)); # $e &mov($tmp1, &swtmp(16+4)); # $e
&add($A, $tmp1); &add($A, $tmp1);
&mov($tmp1, &DWP(16,$tmp2,"",0)); # ctx->E &mov($tmp1, &DWP(16,$tmp2,"",0)); # ctx->E
&add($B, $tmp1); &add($B, $tmp1);
&mov($tmp1, &swtmp(1+16)); # $a &mov($tmp1, &swtmp(16+0)); # $a
&add($B, $tmp1); &add($B, $tmp1);
&mov($tmp1, &DWP( 0,$tmp2,"",0)); # ctx->A &mov($tmp1, &DWP( 0,$tmp2,"",0)); # ctx->A
&add($C, $tmp1); &add($C, $tmp1);
&mov($tmp1, &swtmp(1+17)); # $b &mov($tmp1, &swtmp(16+1)); # $b
&add($C, $tmp1); &add($C, $tmp1);
&mov($tmp1, &wparam(2));
&mov(&DWP( 0,$tmp2,"",0), $D); &mov(&DWP( 0,$tmp2,"",0), $D);
&mov(&DWP( 4,$tmp2,"",0), $E); &mov(&DWP( 4,$tmp2,"",0), $E);
&mov(&DWP( 8,$tmp2,"",0), $A); &mov(&DWP( 8,$tmp2,"",0), $A);
&mov(&DWP(12,$tmp2,"",0), $B); &sub($tmp1,1);
&mov(&DWP(16,$tmp2,"",0), $C); &mov(&DWP(12,$tmp2,"",0), $B);
&mov(&DWP(16,$tmp2,"",0), $C);
&mov($tmp2, &swtmp(0)); &jle(&label("get_out"));
&mov($tmp1, &wparam(1));
&cmp($tmp2,$tmp1); &mov(&wparam(2),$tmp1);
&mov($tmp2, &wparam(0)); &mov($C, $A);
&mov($tmp1, &wparam(1));
&mov($A, $D);
&add($tmp1, 64);
&mov($B, $E);
&mov(&wparam(1),$tmp1);
# XXX &jmp(&label("start"));
&jge(&label("start"));
&stack_pop(16+5+1); &set_label("get_out");
&stack_pop(16+5+6);
&pop("ebx"); &pop("ebx");
&pop("ebp"); &pop("ebp");