Unobtrusive backport of 32-bit x86 Montgomery improvements from 0.9.9-dev:

you need to use "enable-montasm" to see a difference.  (Huge speed
advantage, but BN_MONT_CTX is not binary compatible, so this can't be
enabled by default in the 0.9.8 branch.)

The CHANGES entry also covers the 64-bit x86 backport in November 2007
by appro.
This commit is contained in:
Bodo Möller
2008-05-01 23:11:34 +00:00
parent db533c96e3
commit 812d8a176c
10 changed files with 858 additions and 4 deletions

22
CHANGES
View File

@@ -4,6 +4,28 @@
Changes between 0.9.8g and 0.9.8h [xx XXX xxxx]
*) Partial backport from 0.9.9-dev:
New candidate for BIGNUM assembler implementation, bn_mul_mont,
dedicated Montgomery multiplication procedure, is introduced.
While 0.9.9-dev has assembler for various architectures, here
in the 0.9.8 branch, only x86_64 is available by default.
With Configure option "enable-montasm" (which exists only for
this backport), the 32-bit x86 assembler implementation can be
activated at compile-time. In 0.9.9-dev, BN_MONT_CTX is modified
to allow bn_mul_mont to reach for higher "64-bit" performance on
certain 32-bit targets. With "enable-montasm", this BN_MONT_CTX
change is activated in the 0.9.8 branch.
Warning: Using "enable-montasm" thus means losing binary
compatibility between patchlevels! (I.e., applications will
have to be recompiled to match the particular library.)
So you may want to avoid this setting for shared libraries.
Use at your own risk.
[Andy Polyakov (32-bit x86 backport: Bodo Moeller)]
*) Add TLS session ticket callback. This allows an application to set
TLS ticket cipher and HMAC keys rather than relying on hardcoded fixed
values. This is useful for key rollover for example where several key

View File

@@ -10,7 +10,7 @@ use strict;
# see INSTALL for instructions.
my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n";
my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [enable-montasm] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n";
# Options:
#
@@ -54,6 +54,8 @@ my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lx
# [no-]zlib [don't] compile support for zlib compression.
# zlib-dynamic Like "zlib", but the zlib library is expected to be a shared
# library and will be loaded in run-time by the OpenSSL library.
# enable-montasm 0.9.8 branch only: enable Montgomery x86 assembler backport
# from 0.9.9
# 386 generate 80386 code
# no-sse2 disables IA-32 SSE2 code, above option implies no-sse2
# no-<cipher> build without specified algorithm (rsa, idea, rc5, ...)
@@ -114,9 +116,9 @@ my $tlib="-lnsl -lsocket";
my $bits1="THIRTY_TWO_BIT ";
my $bits2="SIXTY_FOUR_BIT ";
my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o";
my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o";
my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o";
my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o MAYBE-MO86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o";
my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o MAYBE-MO86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o";
my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o MAYBE-MO86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o::";
my $ia64_asm=":bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o:::sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o::";
@@ -580,6 +582,7 @@ my $no_shared=0; # but "no-shared" is default
my $zlib=1; # but "no-zlib" is default
my $no_krb5=0; # but "no-krb5" is implied unless "--with-krb5-..." is used
my $no_rfc3779=1; # but "no-rfc3779" is default
my $montasm=1; # but "no-montasm" is default
my $no_asm=0;
my $no_dso=0;
my $no_gmp=0;
@@ -616,6 +619,7 @@ my %disabled = ( # "what" => "comment"
"cms" => "default",
"gmp" => "default",
"mdc2" => "default",
"montasm" => "default", # explicit option in 0.9.8 only (implicitly enabled in 0.9.9)
"rc5" => "default",
"rfc3779" => "default",
"seed" => "default",
@@ -895,6 +899,8 @@ foreach (sort (keys %disabled))
{ $no_shared = 1; }
elsif (/^zlib$/)
{ $zlib = 0; }
elsif (/^montasm$/)
{ $montasm = 0; }
elsif (/^static-engine$/)
{ }
elsif (/^zlib-dynamic$/)
@@ -1121,6 +1127,14 @@ if ($no_asm)
$cpuid_obj=$bn_obj=$des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj="";
$sha1_obj=$md5_obj=$rmd160_obj="";
}
if ($montasm)
{
$bn_obj =~ s/MAYBE-MO86-/mo86-/;
}
else
{
$bn_obj =~ s/MAYBE-MO86-[a-z.]*//;
}
if (!$no_shared)
{

View File

@@ -4,3 +4,4 @@ Makefile.save
semantic.cache
co86-elf.s
bn86-elf.s
mo86-elf.s

View File

@@ -67,16 +67,22 @@ bn86-elf.s: asm/bn-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@)
co86-elf.s: asm/co-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@)
mo86-elf.s: asm/mo-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) mo-586.pl elf $(CFLAGS) > ../$@)
# COFF
bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@)
co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@)
mo86-cof.s: asm/mo-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) mo-586.pl coff $(CFLAGS) > ../$@)
# a.out
bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@)
co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@)
mo86-out.s: asm/mo-586.pl ../perlasm/x86asm.pl
(cd asm; $(PERL) mo-586.pl a.out $(CFLAGS) > ../$@)
sparcv8.o: asm/sparcv8.S
$(CC) $(CFLAGS) -c asm/sparcv8.S

603
crypto/bn/asm/mo-586.pl Normal file
View File

@@ -0,0 +1,603 @@
#!/usr/bin/env perl
# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
# from OpenSSL 0.9.9-dev
sub ::asciz
{ my @str=unpack("C*",shift);
push @str,0;
while ($#str>15) {
&data_byte(@str[0..15]);
foreach (0..15) { shift @str; }
}
&data_byte(@str) if (@str);
}
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arraning 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","esp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","esp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("esp","edx"); # this splits them apart modulo 4096
&and ("esp",-64); # align to cache line
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"edx");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"ebp"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengthes on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&lea ("edx",&DWP(0,$sbit,"edx",2));
&shr ($carry,31);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&and ($tp,"eax");
&not ("eax");
&mov ($np,$rp);
&and ($np,"eax");
&or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
&mov ("eax",&DWP(0,$tp,$num,4));
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -303,7 +303,14 @@ struct bn_mont_ctx_st
BIGNUM N; /* The modulus */
BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1
* (Ni is only stored for bignum algorithm) */
#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
/* Non-default compile (can only happen with "enable-montasm"),
* uses the new type from 0.9.9 to accomodate two words: */
BN_ULONG n0[2];/* least significant word(s) of Ni */
#else
/* By default, use old type: */
BN_ULONG n0; /* least significant word of Ni */
#endif
int flags;
};

View File

@@ -122,6 +122,12 @@
#define MONT_WORD /* use the faster word-based algorithm */
#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
#endif
int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_MONT_CTX *mont, BN_CTX *ctx)
{
@@ -133,7 +139,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
if (num>1 && a->top==num && b->top==num)
{
if (bn_wexpand(r,num) == NULL) return(0);
#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num))
#else
if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,&mont->n0,num))
#endif
{
r->neg = a->neg^b->neg;
r->top = num;
@@ -157,7 +167,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
if (!BN_mul(tmp,a,b,ctx)) goto err;
}
/* reduce from aRR to aR */
#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
if (!BN_from_montgomery_word(r,tmp,mont)) goto err;
#else
if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
#endif
bn_check_top(r);
ret=1;
err:
@@ -165,6 +179,145 @@ err:
return(ret);
}
#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
BN_ULONG *ap,*np,*rp,n0,v,*nrp;
int al,nl,max,i,x,ri;
n= &(mont->N);
/* mont->ri is the size of mont->N in bits (rounded up
to the word size) */
al=ri=mont->ri/BN_BITS2;
nl=n->top;
if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
max=(nl+al+1); /* allow for overflow (no?) XXX */
if (bn_wexpand(r,max) == NULL) return(0);
r->neg^=n->neg;
np=n->d;
rp=r->d;
nrp= &(r->d[nl]);
/* clear the top words of T */
for (i=r->top; i<max; i++) /* memset? XXX */
r->d[i]=0;
r->top=max;
n0=mont->n0[0];
#ifdef BN_COUNT
fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
#endif
for (i=0; i<nl; i++)
{
#ifdef __TANDEM
{
long long t1;
long long t2;
long long t3;
t1 = rp[0] * (n0 & 0177777);
t2 = 037777600000l;
t2 = n0 & t2;
t3 = rp[0] & 0177777;
t2 = (t3 * t2) & BN_MASK2;
t1 = t1 + t2;
v=bn_mul_add_words(rp,np,nl,(BN_ULONG) t1);
}
#else
v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
#endif
nrp++;
rp++;
if (((nrp[-1]+=v)&BN_MASK2) >= v)
continue;
else
{
if (((++nrp[0])&BN_MASK2) != 0) continue;
if (((++nrp[1])&BN_MASK2) != 0) continue;
for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
}
}
bn_correct_top(r);
/* mont->ri will be a multiple of the word size and below code
* is kind of BN_rshift(ret,r,mont->ri) equivalent */
if (r->top <= ri)
{
ret->top=0;
return(1);
}
al=r->top-ri;
if (bn_wexpand(ret,ri) == NULL) return(0);
x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */
ret->neg=r->neg;
rp=ret->d;
ap=&(r->d[ri]);
{
size_t m1,m2;
v=bn_sub_words(rp,ap,np,ri);
/* this ----------------^^ works even in al<ri case
* thanks to zealous zeroing of top of the vector in the
* beginning. */
/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
/* in other words if subtraction result is real, then
* trick unconditional memcpy below to perform in-place
* "refresh" instead of actual copy. */
m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
m1|=m2; /* (al!=ri) */
m1|=(0-(size_t)v); /* (al!=ri || v) */
m1&=~m2; /* (al!=ri || v) && !al>ri */
nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
}
/* 'i<ri' is chosen to eliminate dependency on input data, even
* though it results in redundant copy in al<ri case. */
for (i=0,ri-=4; i<ri; i+=4)
{
BN_ULONG t1,t2,t3,t4;
t1=nrp[i+0];
t2=nrp[i+1];
t3=nrp[i+2]; ap[i+0]=0;
t4=nrp[i+3]; ap[i+1]=0;
rp[i+0]=t1; ap[i+2]=0;
rp[i+1]=t2; ap[i+3]=0;
rp[i+2]=t3;
rp[i+3]=t4;
}
for (ri+=4; i<ri; i++)
rp[i]=nrp[i], ap[i]=0;
bn_correct_top(r);
bn_correct_top(ret);
bn_check_top(ret);
return(1);
}
int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX *ctx)
{
int retn=0;
BIGNUM *t;
BN_CTX_start(ctx);
if ((t = BN_CTX_get(ctx)) && BN_copy(t,a))
retn = BN_from_montgomery_word(ret,t,mont);
BN_CTX_end(ctx);
return retn;
}
#else
int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX *ctx)
{
@@ -357,6 +510,7 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX_end(ctx);
return(retn);
}
#endif /* defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) */
BN_MONT_CTX *BN_MONT_CTX_new(void)
{
@@ -376,6 +530,11 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
BN_init(&(ctx->RR));
BN_init(&(ctx->N));
BN_init(&(ctx->Ni));
#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
ctx->n0[0] = ctx->n0[1] = 0;
#else
ctx->n0 = 0;
#endif
ctx->flags=0;
}
@@ -409,7 +568,11 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
BN_zero(R);
#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
if (!(BN_set_bit(R,2*BN_BITS2))) goto err; /* R */
#else
if (!(BN_set_bit(R,BN_BITS2))) goto err; /* R */
#endif
buf[0]=mod->d[0]; /* tmod = N mod word size */
buf[1]=0;
@@ -419,6 +582,35 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
tmod.top = buf[0] != 0 ? 1 : 0;
tmod.dmax=2;
tmod.neg=0;
#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
tmod.top=0;
if ((buf[0] = mod->d[0])) tmod.top=1;
if ((buf[1] = mod->top>1 ? mod->d[1] : 0)) tmod.top=2;
if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
goto err;
if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */
if (!BN_is_zero(Ri))
{
if (!BN_sub_word(Ri,1)) goto err;
}
else /* if N mod word size == 1 */
{
if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL)
goto err;
/* Ri-- (mod double word size) */
Ri->neg=0;
Ri->d[0]=BN_MASK2;
Ri->d[1]=BN_MASK2;
Ri->top=2;
}
if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
/* Ni = (R*Ri-1)/N,
* keep only couple of least significant words: */
mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
#else
/* Ri = R^-1 mod N*/
if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
goto err;
@@ -435,6 +627,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
/* Ni = (R*Ri-1)/N,
* keep only least significant word: */
mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0;
#endif
}
#else /* !MONT_WORD */
{ /* bignum version */
@@ -470,7 +663,12 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
if (!BN_copy(&(to->N),&(from->N))) return NULL;
if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL;
to->ri=from->ri;
#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
to->n0[0]=from->n0[0];
to->n0[1]=from->n0[1];
#else
to->n0=from->n0;
#endif
return(to);
}

View File

@@ -146,6 +146,7 @@ sub main'exch { &out2("xchg",@_); }
sub main'cmp { &out2("cmp",@_); }
sub main'lea { &out2("lea",@_); }
sub main'mul { &out1("mul",@_); }
sub main'imul { &out2("imul",@_); }
sub main'div { &out1("div",@_); }
sub main'dec { &out1("dec",@_); }
sub main'inc { &out1("inc",@_); }

View File

@@ -154,6 +154,7 @@ sub main'exch { &out2("xchg",@_); }
sub main'cmp { &out2("cmp",@_); }
sub main'lea { &out2("lea",@_); }
sub main'mul { &out1("mul",@_); }
sub main'imul { &out2("imul",@_); }
sub main'div { &out1("div",@_); }
sub main'dec { &out1("dec",@_); }
sub main'inc { &out1("inc",@_); }

View File

@@ -171,6 +171,7 @@ sub main'exch { &out2($_[0]=~/%[a-d][lh]/?"xchgb":"xchgl",@_); }
sub main'cmp { &out2("cmpl",@_); }
sub main'lea { &out2("leal",@_); }
sub main'mul { &out1("mull",@_); }
sub main'imul { &out2("imull",@_); }
sub main'div { &out1("divl",@_); }
sub main'jmp { &out1("jmp",@_); }
sub main'jmp_ptr { &out1p("jmp",@_); }