Minor performance improvements to x86-mont.pl.
This commit is contained in:
parent
8f2d60ec26
commit
4cfe3df1f5
@ -24,7 +24,7 @@
|
|||||||
#
|
#
|
||||||
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
|
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
|
||||||
# Integer-only code [being equipped with dedicated squaring procedure]
|
# Integer-only code [being equipped with dedicated squaring procedure]
|
||||||
# gives >=30% on rsa512 sign benchmark...
|
# gives ~40% on rsa512 sign benchmark...
|
||||||
|
|
||||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||||
push(@INC,"${dir}","${dir}../../perlasm");
|
push(@INC,"${dir}","${dir}../../perlasm");
|
||||||
@ -46,12 +46,12 @@ $rp="edi"; $bp="edi"; # overlapping variables!!!
|
|||||||
$np="ebp";
|
$np="ebp";
|
||||||
$num="ebx";
|
$num="ebx";
|
||||||
|
|
||||||
$_rp=&DWP(4*0,"esp"); # stack top layout
|
$_num=&DWP(4*0,"esp"); # stack top layout
|
||||||
$_ap=&DWP(4*1,"esp");
|
$_rp=&DWP(4*1,"esp");
|
||||||
$_bp=&DWP(4*2,"esp");
|
$_ap=&DWP(4*2,"esp");
|
||||||
$_np=&DWP(4*3,"esp");
|
$_bp=&DWP(4*3,"esp");
|
||||||
$_n0=&DWP(4*4,"esp");
|
$_np=&DWP(4*4,"esp");
|
||||||
$_num=&DWP(4*5,"esp");
|
$_n0=&DWP(4*5,"esp");
|
||||||
$_sp=&DWP(4*6,"esp");
|
$_sp=&DWP(4*6,"esp");
|
||||||
$_bpend=&DWP(4*7,"esp");
|
$_bpend=&DWP(4*7,"esp");
|
||||||
$frame=32; # size of above frame rounded up to 16n
|
$frame=32; # size of above frame rounded up to 16n
|
||||||
@ -61,20 +61,37 @@ $frame=32; # size of above frame rounded up to 16n
|
|||||||
&cmp ("edi",4);
|
&cmp ("edi",4);
|
||||||
&jl (&label("just_leave"));
|
&jl (&label("just_leave"));
|
||||||
|
|
||||||
################################# load argument block...
|
&lea ("esi",&wparam(0)); # put aside pointer to argument block
|
||||||
&mov ("eax",&wparam(0)); # BN_ULONG *rp
|
&lea ("edx",&wparam(1)); # load ap
|
||||||
&mov ("ebx",&wparam(1)); # const BN_ULONG *ap
|
|
||||||
&mov ("ecx",&wparam(2)); # const BN_ULONG *bp
|
|
||||||
&mov ("edx",&wparam(3)); # const BN_ULONG *np
|
|
||||||
&mov ("esi",&wparam(4)); # const BN_ULONG *n0
|
|
||||||
#&mov ("edi",&wparam(5)); # int num
|
|
||||||
|
|
||||||
&mov ("ebp","esp"); # saved stack pointer!
|
&mov ("ebp","esp"); # saved stack pointer!
|
||||||
&add ("edi",2); # extra two words on top of tp
|
&add ("edi",2); # extra two words on top of tp
|
||||||
&neg ("edi");
|
&neg ("edi");
|
||||||
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
|
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
|
||||||
&neg ("edi");
|
&neg ("edi");
|
||||||
&and ("esp",-4096); # minimize TLB utilization
|
|
||||||
|
# minimize cache contention by arranging 2K window between stack
|
||||||
|
# pointer and ap argument [np is also position sensitive vector,
|
||||||
|
# but it's assumed to be near ap, as it's allocated at ~same
|
||||||
|
# time].
|
||||||
|
&mov ("eax","esp");
|
||||||
|
&sub ("eax","edx");
|
||||||
|
&and ("eax",2047);
|
||||||
|
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
|
||||||
|
|
||||||
|
&xor ("edx","esp");
|
||||||
|
&and ("edx",2048);
|
||||||
|
&xor ("edx",2048);
|
||||||
|
&sub ("esp","edx"); # this splits them apart modulo 4096
|
||||||
|
|
||||||
|
&and ("esp",-64); # align to cache line
|
||||||
|
|
||||||
|
################################# load argument block...
|
||||||
|
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
|
||||||
|
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
|
||||||
|
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
|
||||||
|
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
|
||||||
|
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
|
||||||
|
#&mov ("edi",&DWP(5*4,"esi"));# int num
|
||||||
|
|
||||||
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
|
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
|
||||||
&mov ($_rp,"eax"); # ... save a copy of argument block
|
&mov ($_rp,"eax"); # ... save a copy of argument block
|
||||||
@ -131,7 +148,7 @@ $mask="mm7";
|
|||||||
&psrlq ($car1,32);
|
&psrlq ($car1,32);
|
||||||
|
|
||||||
&inc ($j); # j++
|
&inc ($j); # j++
|
||||||
&set_label("1st");
|
&set_label("1st",16);
|
||||||
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
|
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
|
||||||
&pmuludq($acc1,$mul1); # np[j]*m1
|
&pmuludq($acc1,$mul1); # np[j]*m1
|
||||||
&paddq ($car0,$acc0); # +=c0
|
&paddq ($car0,$acc0); # +=c0
|
||||||
@ -250,11 +267,11 @@ if (0) {
|
|||||||
&xor ("eax","eax"); # signal "not fast enough [yet]"
|
&xor ("eax","eax"); # signal "not fast enough [yet]"
|
||||||
&jmp (&label("just_leave"));
|
&jmp (&label("just_leave"));
|
||||||
# While the below code provides competitive performance for
|
# While the below code provides competitive performance for
|
||||||
# all key lengths on modern cores, it's still a tad slower
|
# all key lengths on modern Intel cores, it's still more
|
||||||
# for >=2048-bits keys on *elder* CPUs:-( "Competitive" means
|
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
|
||||||
# compared to the original integer-only assembler. 512-bit
|
# means compared to the original integer-only assembler.
|
||||||
# RSA sign is better by >=30%, but that's about all one can
|
# 512-bit RSA sign is better by ~40%, but that's about all
|
||||||
# say about all CPUs...
|
# one can say about all CPUs...
|
||||||
} else {
|
} else {
|
||||||
$inp="esi"; # integer path uses these registers differently
|
$inp="esi"; # integer path uses these registers differently
|
||||||
$word="edi";
|
$word="edi";
|
||||||
@ -496,13 +513,13 @@ $sbit=$num;
|
|||||||
&mov ($carry,"edx");
|
&mov ($carry,"edx");
|
||||||
&mul ($word); # ap[j]*ap[i]
|
&mul ($word); # ap[j]*ap[i]
|
||||||
&add ("eax",$carry);
|
&add ("eax",$carry);
|
||||||
&lea ($j,&DWP(1,$j));
|
&lea ($carry,&DWP(0,"eax","eax"));
|
||||||
&adc ("edx",0);
|
&adc ("edx",0);
|
||||||
&lea ($carry,&DWP(0,$sbit,"eax",2));
|
|
||||||
&shr ("eax",31);
|
&shr ("eax",31);
|
||||||
&cmp ($carry,$sbit);
|
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||||
|
&lea ($j,&DWP(1,$j));
|
||||||
&adc ("eax",0);
|
&adc ("eax",0);
|
||||||
&add ($carry,&DWP($frame-4,"esp",$j,4)); # +=tp[j]
|
&add ($carry,$sbit);
|
||||||
&adc ("eax",0);
|
&adc ("eax",0);
|
||||||
&cmp ($j,$_num);
|
&cmp ($j,$_num);
|
||||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||||
|
Loading…
x
Reference in New Issue
Block a user