diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl index 8d01b7a87..01eb1473d 100755 --- a/crypto/bn/asm/x86-mont.pl +++ b/crypto/bn/asm/x86-mont.pl @@ -2,8 +2,9 @@ # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # October 2005 @@ -31,12 +32,12 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &function_begin("bn_mul_mont",$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); -$i="ebx"; +$i="edx"; $j="ecx"; $ap="esi"; $rp="edi"; $bp="edi"; # overlapping variables!!! -$np="edx"; -$num="ebp"; +$np="ebp"; +$num="ebx"; $_rp=&DWP(4*0,"esp"); # stack top layout $_ap=&DWP(4*1,"esp"); @@ -45,9 +46,41 @@ $_np=&DWP(4*3,"esp"); $_n0=&DWP(4*4,"esp"); $_num=&DWP(4*5,"esp"); $_sp=&DWP(4*6,"esp"); +$_bpend=&DWP(4*7,"esp"); $frame=32; # size of above frame rounded up to 16n -$acc0="mm0"; # mmx register bank layout + &xor ("eax","eax"); + &mov ("edi",&wparam(5)); # int num + &cmp ("edi",3); + &jb (&label("just_leave")); + + ################################# load argument block... + &mov ("eax",&wparam(0)); # BN_ULONG *rp + &mov ("ebx",&wparam(1)); # const BN_ULONG *ap + &mov ("ecx",&wparam(2)); # const BN_ULONG *bp + &mov ("edx",&wparam(3)); # const BN_ULONG *np + &mov ("esi",&wparam(4)); # const BN_ULONG *n0 + #&mov ("edi",&wparam(5)); # int num + + &mov ("ebp","esp"); # saved stack pointer! + &add ("edi",2); # extra two words on top of tp + &neg ("edi"); + &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &neg ("edi"); + &and ("esp",-4096); # minimize TLB utilization + + &mov ("esi",&DWP(0,"esi")); # pull n0[0] + &mov ($_rp,"eax"); # ... save a copy of argument block + &mov ($_ap,"ebx"); + &mov ($_bp,"ecx"); + &mov ($_np,"edx"); + &mov ($_n0,"esi"); + &lea ($num,&DWP(-2,"edi")); # num is restored to its original value + #&mov ($_num,$num); # redundant as $num is not reused + &mov ($_sp,"ebp"); # saved stack pointer! + +if($sse2) { +$acc0="mm0"; # mmx register bank layout $acc1="mm1"; $car0="mm2"; $car1="mm3"; @@ -56,37 +89,10 @@ $mul1="mm5"; $temp="mm6"; $mask="mm7"; -if($sse2) { &picmeup("eax","OPENSSL_ia32cap_P"); &bt (&DWP(0,"eax"),26); &jnc (&label("non_sse2")); - ################################# load argument block... - &mov ("eax",&wparam(0)); # BN_ULONG *rp - &mov ("ebx",&wparam(1)); # const BN_ULONG *ap - &mov ("ecx",&wparam(2)); # const BN_ULONG *bp - &mov ("edx",&wparam(3)); # const BN_ULONG *np - &mov ("esi",&wparam(4)); # const BN_ULONG *n0 - &mov ($num,&wparam(5)); # int num - - &mov ("edi","esp"); # saved stack pointer! - &add ($num,1); # extra word on top of tp - &neg ($num); - &lea ("esp",&DWP(-$frame,"esp",$num,4)); # alloca($frame+8*($num+1)) - &neg ($num); - &and ("esp",-1024); # minimize TLB utilization - &sub ($num,1); # num is restored to its original value - # and will remain constant from now... - - &mov ("esi",&DWP(0,"esi")); # pull n0[0] - &mov ($_rp,"eax"); # ... save a copy of argument block - &mov ($_ap,"ebx"); - &mov ($_bp,"ecx"); - &mov ($_np,"edx"); - &mov ($_n0,"esi"); - #&mov ($_num,$num); # redundant in sse2 context - &mov ($_sp,"edi"); # saved stack pointer! - &mov ("eax",-1); &movd ($mask,"eax"); # mask 32 lower bits @@ -195,7 +201,153 @@ if($sse2) { &jl (&label("outer")); &emms (); # done with mmx bank + &jmp (&label("common_tail")); +&set_label("non_sse2",16); +} + +if (1) { + &mov ("esp",$_sp); + &xor ("eax","eax"); # signal "not fast enough [yet]" + &jmp (&label("just_leave")); + # The code below gives ~15% improvement on 512-bit benchmark + # *only*:-( On all other key lengths it's slower for up to 20%. + # This is because the original code path holds down the overall + # amount of multiplications by ~25% by deploying bn_sqr_words. + # In other words, for the code below to be competitive, + # dedicated squaring procedure is a must... +} else { +$inp="esi"; # integer path uses these registers differently +$word="edi"; +$carry="ebp"; + + &sub ($num,1); # non-SSE2 path uses num-1 + + &mov ($inp,$_ap); + &mov ($word,$_bp); + &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] + &mov ($word,&DWP(0,$word)); # bp[0] + &mov ($_bpend,"eax"); + &xor ($j,$j); + &xor ("edx","edx"); + +&set_label("mull",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[0] + &lea ($j,&DWP(1,$j)); + &add ("eax",$carry); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$j,4),"eax"); # tp[j]= + &cmp ($j,$num); + &jb (&label("mull")); + + &mov ("eax",&DWP(0,$inp,$num,4)); # ap[num-1] + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[0] + &add ("eax",$carry); + &adc ("edx",0); + + &mov ($word,$_n0); + &mov ($inp,$_np); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= + &xor ($j,$j); + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mov ("eax",&DWP(0,$inp)); # np[0] + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &adc ("edx",0); + &mov ($j,1); + + &jmp (&label("2ndmadd")); + +&set_label("1stmadd",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[i] + &lea ($j,&DWP(1,$j)); + &add ("eax",&DWP($frame-4,"esp",$j,4)); # +=tp[j] + &adc ("edx",0); + &add ("eax",$carry); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$j,4),"eax"); # tp[j]= + &cmp ($j,$num); + &jb (&label("1stmadd")); + + &mov ("eax",&DWP(0,$inp,$num,4)); # ap[num-1] + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[i] + &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + &add ("eax",$carry); + &adc ("edx",0); + + &mov ($word,$_n0); + &mov ($inp,$_np); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &xor ($j,$j); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= + &adc ($j,0); + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mov ("eax",&DWP(0,$inp)); # np[0] + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &adc ("edx",0); + &mov ($j,1); + +&set_label("2ndmadd",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j] + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &lea ($j,&DWP(1,$j)); + &add ("eax",&DWP($frame-4,"esp",$j,4)); # +=tp[j] + &adc ("edx",0); + &add ("eax",$carry); + &adc ("edx",0); + &mov (&DWP($frame-8,"esp",$j,4),"eax"); # tp[j-1]= + &cmp ($j,$num); + &jb (&label("2ndmadd")); + + &mov ("eax",&DWP(0,$inp,$num,4)); # np[num-1] + &mov ($carry,"edx"); + &mul ($word); # np[num-1]*m + &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + &add ("eax",$carry); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$num,4),"eax"); # tp[num-2]= + + &xor ("eax","eax"); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= + + &mov ($carry,$_bp); # &bp[i] + &add ($carry,4); + &cmp ($carry,$_bpend); + &je (&label("x86done")); + &mov ($word,&DWP(0,$carry)); # bp[i] + &mov ($inp,$_ap); + &mov ($_bp,$carry); # &bp[++i] + &xor ($j,$j); + &xor ("edx","edx"); + &jmp (&label("1stmadd")); + +&set_label("x86done",16); + &mov ($np,$_np); # make adjustments for tail processing + &add ($num,1); +} + +&set_label("common_tail",16); &mov ("esi",&DWP($frame,"esp",$num,4));# load upmost overflow bit &mov ($rp,$_rp); # load result pointer # [$ap and $bp are zapped] @@ -206,15 +358,15 @@ if($sse2) { &mov ("eax",&DWP($frame,"esp",$j,4)); &cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]? &jae (&label("sub")); # if taken CF is cleared -&set_label("copy"); +&set_label("copy",16); &mov ("eax",&DWP($frame,"esp",$j,4)); &mov (&DWP(0,$rp,$j,4),"eax"); # rp[i]=tp[i] &mov (&DWP($frame,"esp",$j,4),$j); # zap temporary vector &dec ($j); &jge (&label("copy")); - &jmp (&label("exit_sse2")); + &jmp (&label("exit")); -&set_label("sub",4); +&set_label("sub",16); &mov ("eax",&DWP($frame,"esp",$i,4)); &sbb ("eax",&DWP(0,$np,$i,4)); &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] @@ -224,21 +376,15 @@ if($sse2) { &lea ($j,&DWP(-1,$num)); # j=num-1 &sbb ("esi",0); # esi holds upmost overflow bit &jc (&label("copy")); -&set_label("zap"); +&set_label("zap",16); &mov (&DWP($frame,"esp",$j,4),$i); # zap temporary vector &dec ($j); &jge (&label("zap")); -&set_label("exit_sse2"); +&set_label("exit",4); &mov ("esp",$_sp); # pull saved stack pointer &mov ("eax",1); - &jmp (&label("leave")); -&set_label("non_sse2"); -} - - &xor ("eax","eax"); # zero signals "not implemented [yet]" - -&set_label("leave"); +&set_label("just_leave"); &function_end("bn_mul_mont"); &asm_finish();