diff --git a/CHANGES b/CHANGES index 0af9ffe98..27c74e2ef 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,28 @@ Changes between 0.9.8g and 0.9.8h [xx XXX xxxx] + *) Partial backport from 0.9.9-dev: + + New candidate for BIGNUM assembler implementation, bn_mul_mont, + dedicated Montgomery multiplication procedure, is introduced. + While 0.9.9-dev has assembler for various architectures, here + in the 0.9.8 branch, only x86_64 is available by default. + + With Configure option "enable-montasm" (which exists only for + this backport), the 32-bit x86 assembler implementation can be + activated at compile-time. In 0.9.9-dev, BN_MONT_CTX is modified + to allow bn_mul_mont to reach for higher "64-bit" performance on + certain 32-bit targets. With "enable-montasm", this BN_MONT_CTX + change is activated in the 0.9.8 branch. + + Warning: Using "enable-montasm" thus means losing binary + compatibility between patchlevels! (I.e., applications will + have to be recompiled to match the particular library.) + So you may want to avoid this setting for shared libraries. + Use at your own risk. + + [Andy Polyakov (32-bit x86 backport: Bodo Moeller)] + *) Add TLS session ticket callback. This allows an application to set TLS ticket cipher and HMAC keys rather than relying on hardcoded fixed values. This is useful for key rollover for example where several key diff --git a/Configure b/Configure index c7dd1194e..1a9a59f96 100755 --- a/Configure +++ b/Configure @@ -10,7 +10,7 @@ use strict; # see INSTALL for instructions. -my $usage="Usage: Configure [no- ...] [enable- ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n"; +my $usage="Usage: Configure [no- ...] [enable- ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [enable-montasm] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n"; # Options: # @@ -54,6 +54,8 @@ my $usage="Usage: Configure [no- ...] [enable- ...] [-Dxxx] [-lx # [no-]zlib [don't] compile support for zlib compression. # zlib-dynamic Like "zlib", but the zlib library is expected to be a shared # library and will be loaded in run-time by the OpenSSL library. +# enable-montasm 0.9.8 branch only: enable Montgomery x86 assembler backport +# from 0.9.9 # 386 generate 80386 code # no-sse2 disables IA-32 SSE2 code, above option implies no-sse2 # no- build without specified algorithm (rsa, idea, rc5, ...) @@ -114,9 +116,9 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o"; -my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o"; -my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o"; +my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o MAYBE-MO86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o"; +my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o MAYBE-MO86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o"; +my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o MAYBE-MO86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o::"; my $ia64_asm=":bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o:::sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o::"; @@ -580,6 +582,7 @@ my $no_shared=0; # but "no-shared" is default my $zlib=1; # but "no-zlib" is default my $no_krb5=0; # but "no-krb5" is implied unless "--with-krb5-..." is used my $no_rfc3779=1; # but "no-rfc3779" is default +my $montasm=1; # but "no-montasm" is default my $no_asm=0; my $no_dso=0; my $no_gmp=0; @@ -616,6 +619,7 @@ my %disabled = ( # "what" => "comment" "cms" => "default", "gmp" => "default", "mdc2" => "default", + "montasm" => "default", # explicit option in 0.9.8 only (implicitly enabled in 0.9.9) "rc5" => "default", "rfc3779" => "default", "seed" => "default", @@ -895,6 +899,8 @@ foreach (sort (keys %disabled)) { $no_shared = 1; } elsif (/^zlib$/) { $zlib = 0; } + elsif (/^montasm$/) + { $montasm = 0; } elsif (/^static-engine$/) { } elsif (/^zlib-dynamic$/) @@ -1121,6 +1127,14 @@ if ($no_asm) $cpuid_obj=$bn_obj=$des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj=""; $sha1_obj=$md5_obj=$rmd160_obj=""; } +if ($montasm) + { + $bn_obj =~ s/MAYBE-MO86-/mo86-/; + } +else + { + $bn_obj =~ s/MAYBE-MO86-[a-z.]*//; + } if (!$no_shared) { diff --git a/crypto/bn/.cvsignore b/crypto/bn/.cvsignore index 57df22cf6..c2f3bc085 100644 --- a/crypto/bn/.cvsignore +++ b/crypto/bn/.cvsignore @@ -4,3 +4,4 @@ Makefile.save semantic.cache co86-elf.s bn86-elf.s +mo86-elf.s diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index 6dfd528d5..e97c75139 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -67,16 +67,22 @@ bn86-elf.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@) co86-elf.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@) +mo86-elf.s: asm/mo-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) mo-586.pl elf $(CFLAGS) > ../$@) # COFF bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@) co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@) +mo86-cof.s: asm/mo-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) mo-586.pl coff $(CFLAGS) > ../$@) # a.out bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@) co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@) +mo86-out.s: asm/mo-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) mo-586.pl a.out $(CFLAGS) > ../$@) sparcv8.o: asm/sparcv8.S $(CC) $(CFLAGS) -c asm/sparcv8.S diff --git a/crypto/bn/asm/mo-586.pl b/crypto/bn/asm/mo-586.pl new file mode 100644 index 000000000..098229309 --- /dev/null +++ b/crypto/bn/asm/mo-586.pl @@ -0,0 +1,603 @@ +#!/usr/bin/env perl + +# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl) +# from OpenSSL 0.9.9-dev + +sub ::asciz +{ my @str=unpack("C*",shift); + push @str,0; + while ($#str>15) { + &data_byte(@str[0..15]); + foreach (0..15) { shift @str; } + } + &data_byte(@str) if (@str); +} + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2005 +# +# This is a "teaser" code, as it can be improved in several ways... +# First of all non-SSE2 path should be implemented (yes, for now it +# performs Montgomery multiplication/convolution only on SSE2-capable +# CPUs such as P4, others fall down to original code). Then inner loop +# can be unrolled and modulo-scheduled to improve ILP and possibly +# moved to 128-bit XMM register bank (though it would require input +# rearrangement and/or increase bus bandwidth utilization). Dedicated +# squaring procedure should give further performance improvement... +# Yet, for being draft, the code improves rsa512 *sign* benchmark by +# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) + +# December 2006 +# +# Modulo-scheduling SSE2 loops results in further 15-20% improvement. +# Integer-only code [being equipped with dedicated squaring procedure] +# gives ~40% on rsa512 sign benchmark... + +push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +&function_begin("bn_mul_mont"); + +$i="edx"; +$j="ecx"; +$ap="esi"; $tp="esi"; # overlapping variables!!! +$rp="edi"; $bp="edi"; # overlapping variables!!! +$np="ebp"; +$num="ebx"; + +$_num=&DWP(4*0,"esp"); # stack top layout +$_rp=&DWP(4*1,"esp"); +$_ap=&DWP(4*2,"esp"); +$_bp=&DWP(4*3,"esp"); +$_np=&DWP(4*4,"esp"); +$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); +$_sp=&DWP(4*6,"esp"); +$_bpend=&DWP(4*7,"esp"); +$frame=32; # size of above frame rounded up to 16n + + &xor ("eax","eax"); + &mov ("edi",&wparam(5)); # int num + &cmp ("edi",4); + &jl (&label("just_leave")); + + &lea ("esi",&wparam(0)); # put aside pointer to argument block + &lea ("edx",&wparam(1)); # load ap + &mov ("ebp","esp"); # saved stack pointer! + &add ("edi",2); # extra two words on top of tp + &neg ("edi"); + &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &neg ("edi"); + + # minimize cache contention by arraning 2K window between stack + # pointer and ap argument [np is also position sensitive vector, + # but it's assumed to be near ap, as it's allocated at ~same + # time]. + &mov ("eax","esp"); + &sub ("eax","edx"); + &and ("eax",2047); + &sub ("esp","eax"); # this aligns sp and ap modulo 2048 + + &xor ("edx","esp"); + &and ("edx",2048); + &xor ("edx",2048); + &sub ("esp","edx"); # this splits them apart modulo 4096 + + &and ("esp",-64); # align to cache line + + ################################# load argument block... + &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp + &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap + &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp + &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 + #&mov ("edi",&DWP(5*4,"esi"));# int num + + &mov ("esi",&DWP(0,"esi")); # pull n0[0] + &mov ($_rp,"eax"); # ... save a copy of argument block + &mov ($_ap,"ebx"); + &mov ($_bp,"ecx"); + &mov ($_np,"edx"); + &mov ($_n0,"esi"); + &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling + #&mov ($_num,$num); # redundant as $num is not reused + &mov ($_sp,"ebp"); # saved stack pointer! + +if($sse2) { +$acc0="mm0"; # mmx register bank layout +$acc1="mm1"; +$car0="mm2"; +$car1="mm3"; +$mul0="mm4"; +$mul1="mm5"; +$temp="mm6"; +$mask="mm7"; + + &picmeup("eax","OPENSSL_ia32cap_P"); + &bt (&DWP(0,"eax"),26); + &jnc (&label("non_sse2")); + + &mov ("eax",-1); + &movd ($mask,"eax"); # mask 32 lower bits + + &mov ($ap,$_ap); # load input pointers + &mov ($bp,$_bp); + &mov ($np,$_np); + + &xor ($i,$i); # i=0 + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp)); # bp[0] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($car1,&DWP(0,$np)); # np[0] + + &pmuludq($mul1,$mul0); # ap[0]*bp[0] + &movq ($car0,$mul1); + &movq ($acc0,$mul1); # I wish movd worked for + &pand ($acc0,$mask); # inter-register transfers + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 + &paddq ($car1,$acc0); + + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &inc ($j); # j++ +&set_label("1st",16); + &pmuludq($acc0,$mul0); # ap[j]*bp[0] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[0]; + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= + &psrlq ($car1,32); + + &lea ($j,&DWP(1,$j)); + &cmp ($j,$num); + &jl (&label("1st")); + + &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &paddq ($car1,$car0); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &inc ($i); # i++ +&set_label("outer"); + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($temp,&DWP($frame,"esp")); # tp[0] + &movd ($car1,&DWP(0,$np)); # np[0] + &pmuludq($mul1,$mul0); # ap[0]*bp[i] + + &paddq ($mul1,$temp); # +=tp[0] + &movq ($acc0,$mul1); + &movq ($car0,$mul1); + &pand ($acc0,$mask); + + &pmuludq($mul1,$_n0q); # *=n0 + + &pmuludq($car1,$mul1); + &paddq ($car1,$acc0); + + &movd ($temp,&DWP($frame+4,"esp")); # tp[1] + &movd ($acc1,&DWP(4,$np)); # np[1] + &movd ($acc0,&DWP(4,$ap)); # ap[1] + + &psrlq ($car0,32); + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[1] + + &inc ($j); # j++ + &dec ($num); +&set_label("inner"); + &pmuludq($acc0,$mul0); # ap[j]*bp[i] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] + &pand ($acc0,$mask); + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] + &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] + &psrlq ($car0,32); + &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= + &psrlq ($car1,32); + &paddq ($car0,$temp); # +=tp[j+1] + + &dec ($num); + &lea ($j,&DWP(1,$j)); # j++ + &jnz (&label("inner")); + + &mov ($num,$j); + &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] + &pmuludq($acc1,$mul1); # np[num-1]*m1 + &paddq ($car0,$acc0); # +=c0 + &paddq ($car1,$acc1); # +=c1 + + &movq ($acc0,$car0); + &pand ($acc0,$mask); + &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= + &psrlq ($car0,32); + &psrlq ($car1,32); + + &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] + &paddq ($car1,$car0); + &paddq ($car1,$temp); + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] + + &lea ($i,&DWP(1,$i)); # i++ + &cmp ($i,$num); + &jle (&label("outer")); + + &emms (); # done with mmx bank + &jmp (&label("common_tail")); + +&set_label("non_sse2",16); +} + +if (0) { + &mov ("esp",$_sp); + &xor ("eax","eax"); # signal "not fast enough [yet]" + &jmp (&label("just_leave")); + # While the below code provides competitive performance for + # all key lengthes on modern Intel cores, it's still more + # than 10% slower for 4096-bit key elsewhere:-( "Competitive" + # means compared to the original integer-only assembler. + # 512-bit RSA sign is better by ~40%, but that's about all + # one can say about all CPUs... +} else { +$inp="esi"; # integer path uses these registers differently +$word="edi"; +$carry="ebp"; + + &mov ($inp,$_ap); + &lea ($carry,&DWP(1,$num)); + &mov ($word,$_bp); + &xor ($j,$j); # j=0 + &mov ("edx",$inp); + &and ($carry,1); # see if num is even + &sub ("edx",$word); # see if ap==bp + &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] + &or ($carry,"edx"); + &mov ($word,&DWP(0,$word)); # bp[0] + &jz (&label("bn_sqr_mont")); + &mov ($_bpend,"eax"); + &mov ("eax",&DWP(0,$inp)); + &xor ("edx","edx"); + +&set_label("mull",16); + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[0] + &add ($carry,"eax"); + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] + &cmp ($j,$num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("mull")); + + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[0] + &mov ($word,$_n0); + &add ("eax",$carry); + &mov ($inp,$_np); + &adc ("edx",0); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= + &xor ($j,$j); + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mov ("eax",&DWP(0,$inp)); # np[0] + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ("eax",&DWP(4,$inp)); # np[1] + &adc ("edx",0); + &inc ($j); + + &jmp (&label("2ndmadd")); + +&set_label("1stmadd",16); + &mov ($carry,"edx"); + &mul ($word); # ap[j]*bp[i] + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("1stmadd")); + + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*bp[i] + &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &mov ($word,$_n0); + &adc ("edx",0); + &mov ($inp,$_np); + &add ($carry,"eax"); + &adc ("edx",0); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &xor ($j,$j); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= + &adc ($j,0); + &mov ("eax",&DWP(0,$inp)); # np[0] + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ("eax",&DWP(4,$inp)); # np[1] + &adc ("edx",0); + &mov ($j,1); + +&set_label("2ndmadd",16); + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= + &jl (&label("2ndmadd")); + + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + &add ($carry,"eax"); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= + + &xor ("eax","eax"); + &mov ($j,$_bp); # &bp[i] + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] + &lea ($j,&DWP(4,$j)); + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= + &cmp ($j,$_bpend); + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= + &je (&label("common_tail")); + + &mov ($word,&DWP(0,$j)); # bp[i+1] + &mov ($inp,$_ap); + &mov ($_bp,$j); # &bp[++i] + &xor ($j,$j); + &xor ("edx","edx"); + &mov ("eax",&DWP(0,$inp)); + &jmp (&label("1stmadd")); + +&set_label("bn_sqr_mont",16); +$sbit=$num; + &mov ($_num,$num); + &mov ($_bp,$j); # i=0 + + &mov ("eax",$word); # ap[0] + &mul ($word); # ap[0]*ap[0] + &mov (&DWP($frame,"esp"),"eax"); # tp[0]= + &mov ($sbit,"edx"); + &shr ("edx",1); + &and ($sbit,1); + &inc ($j); +&set_label("sqr",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*ap[0] + &add ("eax",$carry); + &lea ($j,&DWP(1,$j)); + &adc ("edx",0); + &lea ($carry,&DWP(0,$sbit,"eax",2)); + &shr ("eax",31); + &cmp ($j,$_num); + &mov ($sbit,"eax"); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &jl (&label("sqr")); + + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] + &mov ($carry,"edx"); + &mul ($word); # ap[num-1]*ap[0] + &add ("eax",$carry); + &mov ($word,$_n0); + &adc ("edx",0); + &mov ($inp,$_np); + &lea ($carry,&DWP(0,$sbit,"eax",2)); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + &shr ("eax",31); + &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= + + &lea ($carry,&DWP(0,"eax","edx",2)); + &mov ("eax",&DWP(0,$inp)); # np[0] + &shr ("edx",31); + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= + &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &mov ($num,$j); + &adc ("edx",0); + &mov ("eax",&DWP(4,$inp)); # np[1] + &mov ($j,1); + +&set_label("3rdmadd",16); + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= + + &mov ($carry,"edx"); + &mul ($word); # np[j+1]*m + &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] + &lea ($j,&DWP(2,$j)); + &adc ("edx",0); + &add ($carry,"eax"); + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] + &adc ("edx",0); + &cmp ($j,$num); + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= + &jl (&label("3rdmadd")); + + &mov ($carry,"edx"); + &mul ($word); # np[j]*m + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] + &adc ("edx",0); + &add ($carry,"eax"); + &adc ("edx",0); + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= + + &mov ($j,$_bp); # i + &xor ("eax","eax"); + &mov ($inp,$_ap); + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= + &cmp ($j,$num); + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= + &je (&label("common_tail")); + + &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] + &lea ($j,&DWP(1,$j)); + &mov ("eax",$word); + &mov ($_bp,$j); # ++i + &mul ($word); # ap[i]*ap[i] + &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] + &adc ("edx",0); + &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= + &xor ($carry,$carry); + &cmp ($j,$num); + &lea ($j,&DWP(1,$j)); + &je (&label("sqrlast")); + + &mov ($sbit,"edx"); # zaps $num + &shr ("edx",1); + &and ($sbit,1); +&set_label("sqradd",16); + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] + &mov ($carry,"edx"); + &mul ($word); # ap[j]*ap[i] + &add ("eax",$carry); + &lea ($carry,&DWP(0,"eax","eax")); + &adc ("edx",0); + &shr ("eax",31); + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] + &lea ($j,&DWP(1,$j)); + &adc ("eax",0); + &add ($carry,$sbit); + &adc ("eax",0); + &cmp ($j,$_num); + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= + &mov ($sbit,"eax"); + &jle (&label("sqradd")); + + &mov ($carry,"edx"); + &lea ("edx",&DWP(0,$sbit,"edx",2)); + &shr ($carry,31); +&set_label("sqrlast"); + &mov ($word,$_n0); + &mov ($inp,$_np); + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] + + &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] + &mov ("eax",&DWP(0,$inp)); # np[0] + &adc ($carry,0); + &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= + + &mul ($word); # np[0]*m + &add ("eax",&DWP($frame,"esp")); # +=tp[0] + &lea ($num,&DWP(-1,$j)); + &adc ("edx",0); + &mov ($j,1); + &mov ("eax",&DWP(4,$inp)); # np[1] + + &jmp (&label("3rdmadd")); +} + +&set_label("common_tail",16); + &mov ($np,$_np); # load modulus pointer + &mov ($rp,$_rp); # load result pointer + &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] + + &mov ("eax",&DWP(0,$tp)); # tp[0] + &mov ($j,$num); # j=num-1 + &xor ($i,$i); # i=0 and clear CF! + +&set_label("sub",16); + &sbb ("eax",&DWP(0,$np,$i,4)); + &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] + &dec ($j); # doesn't affect CF! + &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] + &lea ($i,&DWP(1,$i)); # i++ + &jge (&label("sub")); + + &sbb ("eax",0); # handle upmost overflow bit + &and ($tp,"eax"); + ¬ ("eax"); + &mov ($np,$rp); + &and ($np,"eax"); + &or ($tp,$np); # tp=carry?tp:rp + +&set_label("copy",16); # copy or in-place refresh + &mov ("eax",&DWP(0,$tp,$num,4)); + &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector + &dec ($num); + &jge (&label("copy")); + + &mov ("esp",$_sp); # pull saved stack pointer + &mov ("eax",1); +&set_label("just_leave"); +&function_end("bn_mul_mont"); + +&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by "); + +&asm_finish(); diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index df6eea29a..de39a7207 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -303,7 +303,14 @@ struct bn_mont_ctx_st BIGNUM N; /* The modulus */ BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1 * (Ni is only stored for bignum algorithm) */ +#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) + /* Non-default compile (can only happen with "enable-montasm"), + * uses the new type from 0.9.9 to accomodate two words: */ + BN_ULONG n0[2];/* least significant word(s) of Ni */ +#else + /* By default, use old type: */ BN_ULONG n0; /* least significant word of Ni */ +#endif int flags; }; diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c index 23e4ba514..e17c697e3 100644 --- a/crypto/bn/bn_mont.c +++ b/crypto/bn/bn_mont.c @@ -122,6 +122,12 @@ #define MONT_WORD /* use the faster word-based algorithm */ +#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) +static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont); +#endif + + + int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_MONT_CTX *mont, BN_CTX *ctx) { @@ -133,7 +139,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, if (num>1 && a->top==num && b->top==num) { if (bn_wexpand(r,num) == NULL) return(0); +#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */ + if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num)) +#else if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,&mont->n0,num)) +#endif { r->neg = a->neg^b->neg; r->top = num; @@ -157,7 +167,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, if (!BN_mul(tmp,a,b,ctx)) goto err; } /* reduce from aRR to aR */ +#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) + if (!BN_from_montgomery_word(r,tmp,mont)) goto err; +#else if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err; +#endif bn_check_top(r); ret=1; err: @@ -165,6 +179,145 @@ err: return(ret); } +#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) +static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont) + { + BIGNUM *n; + BN_ULONG *ap,*np,*rp,n0,v,*nrp; + int al,nl,max,i,x,ri; + + n= &(mont->N); + /* mont->ri is the size of mont->N in bits (rounded up + to the word size) */ + al=ri=mont->ri/BN_BITS2; + + nl=n->top; + if ((al == 0) || (nl == 0)) { ret->top=0; return(1); } + + max=(nl+al+1); /* allow for overflow (no?) XXX */ + if (bn_wexpand(r,max) == NULL) return(0); + + r->neg^=n->neg; + np=n->d; + rp=r->d; + nrp= &(r->d[nl]); + + /* clear the top words of T */ + for (i=r->top; id[i]=0; + + r->top=max; + n0=mont->n0[0]; + +#ifdef BN_COUNT + fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl); +#endif + for (i=0; i= v) + continue; + else + { + if (((++nrp[0])&BN_MASK2) != 0) continue; + if (((++nrp[1])&BN_MASK2) != 0) continue; + for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ; + } + } + bn_correct_top(r); + + /* mont->ri will be a multiple of the word size and below code + * is kind of BN_rshift(ret,r,mont->ri) equivalent */ + if (r->top <= ri) + { + ret->top=0; + return(1); + } + al=r->top-ri; + + if (bn_wexpand(ret,ri) == NULL) return(0); + x=0-(((al-ri)>>(sizeof(al)*8-1))&1); + ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */ + ret->neg=r->neg; + + rp=ret->d; + ap=&(r->d[ri]); + + { + size_t m1,m2; + + v=bn_sub_words(rp,ap,np,ri); + /* this ----------------^^ works even in alri) nrp=rp; else nrp=ap; */ + /* in other words if subtraction result is real, then + * trick unconditional memcpy below to perform in-place + * "refresh" instead of actual copy. */ + m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al>(sizeof(al)*8-1))&1); /* al>ri */ + m1|=m2; /* (al!=ri) */ + m1|=(0-(size_t)v); /* (al!=ri || v) */ + m1&=~m2; /* (al!=ri || v) && !al>ri */ + nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1)); + } + + /* 'iRR)); BN_init(&(ctx->N)); BN_init(&(ctx->Ni)); +#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */ + ctx->n0[0] = ctx->n0[1] = 0; +#else + ctx->n0 = 0; +#endif ctx->flags=0; } @@ -409,7 +568,11 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2; BN_zero(R); +#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */ + if (!(BN_set_bit(R,2*BN_BITS2))) goto err; /* R */ +#else if (!(BN_set_bit(R,BN_BITS2))) goto err; /* R */ +#endif buf[0]=mod->d[0]; /* tmod = N mod word size */ buf[1]=0; @@ -419,6 +582,35 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) tmod.top = buf[0] != 0 ? 1 : 0; tmod.dmax=2; tmod.neg=0; + +#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) + tmod.top=0; + if ((buf[0] = mod->d[0])) tmod.top=1; + if ((buf[1] = mod->top>1 ? mod->d[1] : 0)) tmod.top=2; + + if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL) + goto err; + if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */ + if (!BN_is_zero(Ri)) + { + if (!BN_sub_word(Ri,1)) goto err; + } + else /* if N mod word size == 1 */ + { + if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL) + goto err; + /* Ri-- (mod double word size) */ + Ri->neg=0; + Ri->d[0]=BN_MASK2; + Ri->d[1]=BN_MASK2; + Ri->top=2; + } + if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err; + /* Ni = (R*Ri-1)/N, + * keep only couple of least significant words: */ + mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0; + mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0; +#else /* Ri = R^-1 mod N*/ if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL) goto err; @@ -435,6 +627,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) /* Ni = (R*Ri-1)/N, * keep only least significant word: */ mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0; +#endif } #else /* !MONT_WORD */ { /* bignum version */ @@ -470,7 +663,12 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from) if (!BN_copy(&(to->N),&(from->N))) return NULL; if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL; to->ri=from->ri; +#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */ + to->n0[0]=from->n0[0]; + to->n0[1]=from->n0[1]; +#else to->n0=from->n0; +#endif return(to); } diff --git a/crypto/perlasm/x86ms.pl b/crypto/perlasm/x86ms.pl index ebf38adb1..a0be2934c 100644 --- a/crypto/perlasm/x86ms.pl +++ b/crypto/perlasm/x86ms.pl @@ -146,6 +146,7 @@ sub main'exch { &out2("xchg",@_); } sub main'cmp { &out2("cmp",@_); } sub main'lea { &out2("lea",@_); } sub main'mul { &out1("mul",@_); } +sub main'imul { &out2("imul",@_); } sub main'div { &out1("div",@_); } sub main'dec { &out1("dec",@_); } sub main'inc { &out1("inc",@_); } diff --git a/crypto/perlasm/x86nasm.pl b/crypto/perlasm/x86nasm.pl index 863c7e9d6..fa38f89c0 100644 --- a/crypto/perlasm/x86nasm.pl +++ b/crypto/perlasm/x86nasm.pl @@ -154,6 +154,7 @@ sub main'exch { &out2("xchg",@_); } sub main'cmp { &out2("cmp",@_); } sub main'lea { &out2("lea",@_); } sub main'mul { &out1("mul",@_); } +sub main'imul { &out2("imul",@_); } sub main'div { &out1("div",@_); } sub main'dec { &out1("dec",@_); } sub main'inc { &out1("inc",@_); } diff --git a/crypto/perlasm/x86unix.pl b/crypto/perlasm/x86unix.pl index 53507b6b8..a4c947165 100644 --- a/crypto/perlasm/x86unix.pl +++ b/crypto/perlasm/x86unix.pl @@ -171,6 +171,7 @@ sub main'exch { &out2($_[0]=~/%[a-d][lh]/?"xchgb":"xchgl",@_); } sub main'cmp { &out2("cmpl",@_); } sub main'lea { &out2("leal",@_); } sub main'mul { &out1("mull",@_); } +sub main'imul { &out2("imull",@_); } sub main'div { &out1("divl",@_); } sub main'jmp { &out1("jmp",@_); } sub main'jmp_ptr { &out1p("jmp",@_); }