Unobtrusive backport of 32-bit x86 Montgomery improvements from 0.9.9-dev:

you need to use "enable-montasm" to see a difference. (Huge speed advantage, but BN_MONT_CTX is not binary compatible, so this can't be enabled by default in the 0.9.8 branch.) The CHANGES entry also covers the 64-bit x86 backport in November 2007 by appro.
2008-05-01 23:11:34 +00:00
parent db533c96e3
commit 812d8a176c
10 changed files with 858 additions and 4 deletions
--- a/22
+++ b/22
@@ -4,6 +4,28 @@

 Changes between 0.9.8g and 0.9.8h  [xx XXX xxxx]

+  *) Partial backport from 0.9.9-dev:
+
+     New candidate for BIGNUM assembler implementation, bn_mul_mont,
+     dedicated Montgomery multiplication procedure, is introduced.
+     While 0.9.9-dev has assembler for various architectures, here
+     in the 0.9.8 branch, only x86_64 is available by default.
+
+     With Configure option "enable-montasm" (which exists only for
+     this backport), the 32-bit x86 assembler implementation can be
+     activated at compile-time.  In 0.9.9-dev, BN_MONT_CTX is modified
+     to allow bn_mul_mont to reach for higher "64-bit" performance on
+     certain 32-bit targets.  With "enable-montasm", this BN_MONT_CTX
+     change is activated in the 0.9.8 branch.
+
+     Warning: Using "enable-montasm" thus means losing binary
+     compatibility between patchlevels!  (I.e., applications will
+     have to be recompiled to match the particular library.)
+     So you may want to avoid this setting for shared libraries.
+     Use at your own risk.
+
+     [Andy Polyakov (32-bit x86 backport: Bodo Moeller)]
+
  *) Add TLS session ticket callback. This allows an application to set
     TLS ticket cipher and HMAC keys rather than relying on hardcoded fixed
     values. This is useful for key rollover for example where several key
--- a/22
+++ b/22
@@ -10,7 +10,7 @@ use strict;

 # see INSTALL for instructions.

-my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n";
+my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [enable-montasm] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n";

 # Options:
 #
@@ -54,6 +54,8 @@ my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lx
 # [no-]zlib     [don't] compile support for zlib compression.
 # zlib-dynamic	Like "zlib", but the zlib library is expected to be a shared
 #		library and will be loaded in run-time by the OpenSSL library.
+# enable-montasm 0.9.8 branch only: enable Montgomery x86 assembler backport
+#               from 0.9.9
 # 386           generate 80386 code
 # no-sse2	disables IA-32 SSE2 code, above option implies no-sse2
 # no-<cipher>   build without specified algorithm (rsa, idea, rc5, ...)
@@ -114,9 +116,9 @@ my $tlib="-lnsl -lsocket";
 my $bits1="THIRTY_TWO_BIT ";
 my $bits2="SIXTY_FOUR_BIT ";

-my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o";
-my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o";
-my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o";
+my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o MAYBE-MO86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o";
+my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o MAYBE-MO86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o";
+my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o MAYBE-MO86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o";

 my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o::";
 my $ia64_asm=":bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o:::sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o::";
@@ -580,6 +582,7 @@ my $no_shared=0; # but "no-shared" is default
 my $zlib=1;      # but "no-zlib" is default
 my $no_krb5=0;   # but "no-krb5" is implied unless "--with-krb5-..." is used
 my $no_rfc3779=1; # but "no-rfc3779" is default
+my $montasm=1;   # but "no-montasm" is default
 my $no_asm=0;
 my $no_dso=0;
 my $no_gmp=0;
@@ -616,6 +619,7 @@ my %disabled = ( # "what"         => "comment"
                 "cms"            => "default",
                 "gmp"            => "default",
                 "mdc2"           => "default",
+                 "montasm"        => "default", # explicit option in 0.9.8 only (implicitly enabled in 0.9.9)
                 "rc5"            => "default",
                 "rfc3779"        => "default",
                 "seed"           => "default",
@@ -895,6 +899,8 @@ foreach (sort (keys %disabled))
 		{ $no_shared = 1; }
 	elsif (/^zlib$/)
 		{ $zlib = 0; }
+	elsif (/^montasm$/)
+		{ $montasm = 0; }
 	elsif (/^static-engine$/)
 		{ }
 	elsif (/^zlib-dynamic$/)
@@ -1121,6 +1127,14 @@ if ($no_asm)
 	$cpuid_obj=$bn_obj=$des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj="";
 	$sha1_obj=$md5_obj=$rmd160_obj="";
 	}
+if ($montasm)
+	{
+	$bn_obj =~ s/MAYBE-MO86-/mo86-/;
+	}
+else
+	{
+	$bn_obj =~ s/MAYBE-MO86-[a-z.]*//;
+	}

 if (!$no_shared)
 	{
--- a/crypto/bn/.cvsignore
+++ b/crypto/bn/.cvsignore
@@ -4,3 +4,4 @@ Makefile.save
 semantic.cache
 co86-elf.s
 bn86-elf.s
+mo86-elf.s
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile
@@ -67,16 +67,22 @@ bn86-elf.s:	asm/bn-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@)
 co86-elf.s:	asm/co-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@)
+mo86-elf.s:	asm/mo-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) mo-586.pl elf $(CFLAGS) > ../$@)
 # COFF
 bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@)
 co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@)
+mo86-cof.s: asm/mo-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) mo-586.pl coff $(CFLAGS) > ../$@)
 # a.out
 bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@)
 co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl
 	(cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@)
+mo86-out.s: asm/mo-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) mo-586.pl a.out $(CFLAGS) > ../$@)

 sparcv8.o:	asm/sparcv8.S
 	$(CC) $(CFLAGS) -c asm/sparcv8.S
--- a/crypto/bn/asm/mo-586.pl
+++ b/crypto/bn/asm/mo-586.pl
@@ -0,0 +1,603 @@
+#!/usr/bin/env perl
+
+# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
+# from OpenSSL 0.9.9-dev 
+
+sub ::asciz
+{ my @str=unpack("C*",shift);
+    push @str,0;
+    while ($#str>15) {
+	&data_byte(@str[0..15]);
+	foreach (0..15) { shift @str; }
+    }
+    &data_byte(@str) if (@str);
+}
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";	$tp="esi";		# overlapping variables!!!
+$rp="edi";	$bp="edi";		# overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");			# stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;				# size of above frame rounded up to 16n
+
+	&xor	("eax","eax");
+	&mov	("edi",&wparam(5));	# int num
+	&cmp	("edi",4);
+	&jl	(&label("just_leave"));
+
+	&lea	("esi",&wparam(0));	# put aside pointer to argument block
+	&lea	("edx",&wparam(1));	# load ap
+	&mov	("ebp","esp");		# saved stack pointer!
+	&add	("edi",2);		# extra two words on top of tp
+	&neg	("edi");
+	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
+	&neg	("edi");
+
+	# minimize cache contention by arraning 2K window between stack
+	# pointer and ap argument [np is also position sensitive vector,
+	# but it's assumed to be near ap, as it's allocated at ~same
+	# time].
+	&mov	("eax","esp");
+	&sub	("eax","edx");
+	&and	("eax",2047);
+	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
+
+	&xor	("edx","esp");
+	&and	("edx",2048);
+	&xor	("edx",2048);
+	&sub	("esp","edx");		# this splits them apart modulo 4096
+
+	&and	("esp",-64);		# align to cache line
+
+	################################# load argument block...
+	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+	#&mov	("edi",&DWP(5*4,"esi"));# int num
+
+	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
+	&mov	($_rp,"eax");		# ... save a copy of argument block
+	&mov	($_ap,"ebx");
+	&mov	($_bp,"ecx");
+	&mov	($_np,"edx");
+	&mov	($_n0,"esi");
+	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
+	#&mov	($_num,$num);		# redundant as $num is not reused
+	&mov	($_sp,"ebp");		# saved stack pointer!
+
+if($sse2) {
+$acc0="mm0";	# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
+
+	&mov	("eax",-1);
+	&movd	($mask,"eax");		# mask 32 lower bits
+
+	&mov	($ap,$_ap);		# load input pointers
+	&mov	($bp,$_bp);
+	&mov	($np,$_np);
+
+	&xor	($i,$i);		# i=0
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp));		# bp[0]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
+	&movq	($car0,$mul1);
+	&movq	($acc0,$mul1);			# I wish movd worked for
+	&pand	($acc0,$mask);			# inter-register transfers
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
+	&paddq	($car1,$acc0);
+
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&inc	($j);				# j++
+&set_label("1st",16);
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+
+	&lea	($j,&DWP(1,$j));
+	&cmp	($j,$num);
+	&jl	(&label("1st"));
+
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&paddq	($car1,$car0);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&inc	($i);				# i++
+&set_label("outer");
+	&xor	($j,$j);			# j=0
+
+	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
+
+	&paddq	($mul1,$temp);			# +=tp[0]
+	&movq	($acc0,$mul1);
+	&movq	($car0,$mul1);
+	&pand	($acc0,$mask);
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);
+	&paddq	($car1,$acc0);
+
+	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[1]
+
+	&inc	($j);				# j++
+	&dec	($num);
+&set_label("inner");
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[j+1]
+
+	&dec	($num);
+	&lea	($j,&DWP(1,$j));		# j++
+	&jnz	(&label("inner"));
+
+	&mov	($num,$j);
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
+	&paddq	($car1,$car0);
+	&paddq	($car1,$temp);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&lea	($i,&DWP(1,$i));		# i++
+	&cmp	($i,$num);
+	&jle	(&label("outer"));
+
+	&emms	();				# done with mmx bank
+	&jmp	(&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");	# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# While the below code provides competitive performance for
+	# all key lengthes on modern Intel cores, it's still more
+	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+	# means compared to the original integer-only assembler.
+	# 512-bit RSA sign is better by ~40%, but that's about all
+	# one can say about all CPUs...
+} else {
+$inp="esi";	# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+	&mov	($inp,$_ap);
+	&lea	($carry,&DWP(1,$num));
+	&mov	($word,$_bp);
+	&xor	($j,$j);				# j=0
+	&mov	("edx",$inp);
+	&and	($carry,1);				# see if num is even
+	&sub	("edx",$word);				# see if ap==bp
+	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
+	&or	($carry,"edx");
+	&mov	($word,&DWP(0,$word));			# bp[0]
+	&jz	(&label("bn_sqr_mont"));
+	&mov	($_bpend,"eax");
+	&mov	("eax",&DWP(0,$inp));
+	&xor	("edx","edx");
+
+&set_label("mull",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[0]
+	&add	($carry,"eax");
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("mull"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[0]
+	 &mov	($word,$_n0);
+	&add	("eax",$carry);
+	 &mov	($inp,$_np);
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&inc	($j);
+
+	&jmp	(&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[i]
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("1stmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
+	&adc	($j,0);
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&mov	($j,1);
+
+&set_label("2ndmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
+	&jl	(&label("2ndmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&xor	("eax","eax");
+	 &mov	($j,$_bp);				# &bp[i]
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	 &lea	($j,&DWP(4,$j));
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	 &cmp	($j,$_bpend);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(0,$j));			# bp[i+1]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$j);				# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&mov	("eax",&DWP(0,$inp));
+	&jmp	(&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+	&mov	($_num,$num);
+	&mov	($_bp,$j);				# i=0
+
+	&mov	("eax",$word);				# ap[0]
+	&mul	($word);				# ap[0]*ap[0]
+	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
+	&mov	($sbit,"edx");
+	&shr	("edx",1);
+	&and	($sbit,1);
+	&inc	($j);
+&set_label("sqr",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[0]
+	&add	("eax",$carry);
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&shr	("eax",31);
+	&cmp	($j,$_num);
+	&mov	($sbit,"eax");
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("sqr"));
+
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*ap[0]
+	&add	("eax",$carry);
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&shr	("eax",31);
+	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
+
+	&lea	($carry,&DWP(0,"eax","edx",2));
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&shr	("edx",31);
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	($num,$j);
+	&adc	("edx",0);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&mov	($j,1);
+
+&set_label("3rdmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j+1]*m
+	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
+	&lea	($j,&DWP(2,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("3rdmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&mov	($j,$_bp);				# i
+	&xor	("eax","eax");
+	&mov	($inp,$_ap);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$num);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
+	&lea	($j,&DWP(1,$j));
+	&mov	("eax",$word);
+	&mov	($_bp,$j);				# ++i
+	&mul	($word);				# ap[i]*ap[i]
+	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
+	&adc	("edx",0);
+	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
+	&xor	($carry,$carry);
+	&cmp	($j,$num);
+	&lea	($j,&DWP(1,$j));
+	&je	(&label("sqrlast"));
+
+	&mov	($sbit,"edx");				# zaps $num
+	&shr	("edx",1);
+	&and	($sbit,1);
+&set_label("sqradd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[i]
+	&add	("eax",$carry);
+	&lea	($carry,&DWP(0,"eax","eax"));
+	&adc	("edx",0);
+	&shr	("eax",31);
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("eax",0);
+	&add	($carry,$sbit);
+	&adc	("eax",0);
+	&cmp	($j,$_num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&mov	($sbit,"eax");
+	&jle	(&label("sqradd"));
+
+	&mov	($carry,"edx");
+	&lea	("edx",&DWP(0,$sbit,"edx",2));
+	&shr	($carry,31);
+&set_label("sqrlast");
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&adc	($carry,0);
+	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&lea	($num,&DWP(-1,$j));
+	&adc	("edx",0);
+	&mov	($j,1);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+
+	&jmp	(&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+	&mov	($np,$_np);			# load modulus pointer
+	&mov	($rp,$_rp);			# load result pointer
+	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
+
+	&mov	("eax",&DWP(0,$tp));		# tp[0]
+	&mov	($j,$num);			# j=num-1
+	&xor	($i,$i);			# i=0 and clear CF!
+
+&set_label("sub",16);
+	&sbb	("eax",&DWP(0,$np,$i,4));
+	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
+	&dec	($j);				# doesn't affect CF!
+	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
+	&lea	($i,&DWP(1,$i));		# i++
+	&jge	(&label("sub"));
+
+	&sbb	("eax",0);			# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);			# tp=carry?tp:rp
+
+&set_label("copy",16);				# copy or in-place refresh
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&dec	($num);
+	&jge	(&label("copy"));
+
+	&mov	("esp",$_sp);		# pull saved stack pointer
+	&mov	("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -303,7 +303,14 @@ struct bn_mont_ctx_st
 	BIGNUM N;      /* The modulus */
 	BIGNUM Ni;     /* R*(1/R mod N) - N*Ni = 1
 	                * (Ni is only stored for bignum algorithm) */
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+	/* Non-default compile (can only happen with "enable-montasm"),
+	 * uses the new type from 0.9.9 to accomodate two words: */
+	BN_ULONG n0[2];/* least significant word(s) of Ni */
+#else
+	/* By default, use old type: */
 	BN_ULONG n0;   /* least significant word of Ni */
+#endif
 	int flags;
 	};

--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -122,6 +122,12 @@

 #define MONT_WORD /* use the faster word-based algorithm */

+#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
+#endif
+
+
+
 int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 			  BN_MONT_CTX *mont, BN_CTX *ctx)
 	{
@@ -133,7 +139,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 	if (num>1 && a->top==num && b->top==num)
 		{
 		if (bn_wexpand(r,num) == NULL) return(0);
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+		if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num))
+#else
 		if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,&mont->n0,num))
+#endif
 			{
 			r->neg = a->neg^b->neg;
 			r->top = num;
@@ -157,7 +167,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 		if (!BN_mul(tmp,a,b,ctx)) goto err;
 		}
 	/* reduce from aRR to aR */
+#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+	if (!BN_from_montgomery_word(r,tmp,mont)) goto err;
+#else
 	if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
+#endif
 	bn_check_top(r);
 	ret=1;
 err:
@@ -165,6 +179,145 @@ err:
 	return(ret);
 	}

+#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
+	{
+	BIGNUM *n;
+	BN_ULONG *ap,*np,*rp,n0,v,*nrp;
+	int al,nl,max,i,x,ri;
+
+	n= &(mont->N);
+	/* mont->ri is the size of mont->N in bits (rounded up
+	   to the word size) */
+	al=ri=mont->ri/BN_BITS2;
+
+	nl=n->top;
+	if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+
+	max=(nl+al+1); /* allow for overflow (no?) XXX */
+	if (bn_wexpand(r,max) == NULL) return(0);
+
+	r->neg^=n->neg;
+	np=n->d;
+	rp=r->d;
+	nrp= &(r->d[nl]);
+
+	/* clear the top words of T */
+	for (i=r->top; i<max; i++) /* memset? XXX */
+		r->d[i]=0;
+
+	r->top=max;
+	n0=mont->n0[0];
+
+#ifdef BN_COUNT
+	fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
+#endif
+	for (i=0; i<nl; i++)
+		{
+#ifdef __TANDEM
+                {
+                   long long t1;
+                   long long t2;
+                   long long t3;
+                   t1 = rp[0] * (n0 & 0177777);
+                   t2 = 037777600000l;
+                   t2 = n0 & t2;
+                   t3 = rp[0] & 0177777;
+                   t2 = (t3 * t2) & BN_MASK2;
+                   t1 = t1 + t2;
+                   v=bn_mul_add_words(rp,np,nl,(BN_ULONG) t1);
+                }
+#else
+		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
+#endif
+		nrp++;
+		rp++;
+		if (((nrp[-1]+=v)&BN_MASK2) >= v)
+			continue;
+		else
+			{
+			if (((++nrp[0])&BN_MASK2) != 0) continue;
+			if (((++nrp[1])&BN_MASK2) != 0) continue;
+			for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
+			}
+		}
+	bn_correct_top(r);
+
+	/* mont->ri will be a multiple of the word size and below code
+	 * is kind of BN_rshift(ret,r,mont->ri) equivalent */
+	if (r->top <= ri)
+		{
+		ret->top=0;
+		return(1);
+		}
+	al=r->top-ri;
+
+	if (bn_wexpand(ret,ri) == NULL) return(0);
+	x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
+	ret->top=x=(ri&~x)|(al&x);	/* min(ri,al) */
+	ret->neg=r->neg;
+
+	rp=ret->d;
+	ap=&(r->d[ri]);
+
+	{
+	size_t m1,m2;
+
+	v=bn_sub_words(rp,ap,np,ri);
+	/* this ----------------^^ works even in al<ri case
+	 * thanks to zealous zeroing of top of the vector in the
+	 * beginning. */
+
+	/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+	/* in other words if subtraction result is real, then
+	 * trick unconditional memcpy below to perform in-place
+	 * "refresh" instead of actual copy. */
+	m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
+	m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
+	m1|=m2;			/* (al!=ri) */
+	m1|=(0-(size_t)v);	/* (al!=ri || v) */
+	m1&=~m2;		/* (al!=ri || v) && !al>ri */
+	nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
+	}
+
+	/* 'i<ri' is chosen to eliminate dependency on input data, even
+	 * though it results in redundant copy in al<ri case. */
+	for (i=0,ri-=4; i<ri; i+=4)
+		{
+		BN_ULONG t1,t2,t3,t4;
+		
+		t1=nrp[i+0];
+		t2=nrp[i+1];
+		t3=nrp[i+2];	ap[i+0]=0;
+		t4=nrp[i+3];	ap[i+1]=0;
+		rp[i+0]=t1;	ap[i+2]=0;
+		rp[i+1]=t2;	ap[i+3]=0;
+		rp[i+2]=t3;
+		rp[i+3]=t4;
+		}
+	for (ri+=4; i<ri; i++)
+		rp[i]=nrp[i], ap[i]=0;
+	bn_correct_top(r);
+	bn_correct_top(ret);
+	bn_check_top(ret);
+
+	return(1);
+	}
+
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
+	     BN_CTX *ctx)
+	{
+	int retn=0;
+	BIGNUM *t;
+
+	BN_CTX_start(ctx);
+	if ((t = BN_CTX_get(ctx)) && BN_copy(t,a))
+		retn = BN_from_montgomery_word(ret,t,mont);
+	BN_CTX_end(ctx);
+	return retn;
+	}
+#else
+
 int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	     BN_CTX *ctx)
 	{
@@ -357,6 +510,7 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	BN_CTX_end(ctx);
 	return(retn);
 	}
+#endif /* defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) */

 BN_MONT_CTX *BN_MONT_CTX_new(void)
 	{
@@ -376,6 +530,11 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
 	BN_init(&(ctx->RR));
 	BN_init(&(ctx->N));
 	BN_init(&(ctx->Ni));
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+	ctx->n0[0] = ctx->n0[1] = 0;
+#else
+	ctx->n0 = 0;
+#endif
 	ctx->flags=0;
 	}

@@ -409,7 +568,11 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)

 		mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
 		BN_zero(R);
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+		if (!(BN_set_bit(R,2*BN_BITS2))) goto err;	/* R */
+#else
 		if (!(BN_set_bit(R,BN_BITS2))) goto err;	/* R */
+#endif

 		buf[0]=mod->d[0]; /* tmod = N mod word size */
 		buf[1]=0;
@@ -419,6 +582,35 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
 		tmod.top = buf[0] != 0 ? 1 : 0;
 		tmod.dmax=2;
 		tmod.neg=0;
+
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+								tmod.top=0;
+		if ((buf[0] = mod->d[0]))			tmod.top=1;
+		if ((buf[1] = mod->top>1 ? mod->d[1] : 0))	tmod.top=2;
+
+		if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
+			goto err;
+		if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */
+		if (!BN_is_zero(Ri))
+			{
+			if (!BN_sub_word(Ri,1)) goto err;
+			}
+		else /* if N mod word size == 1 */
+			{
+			if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL)
+				goto err;
+			/* Ri-- (mod double word size) */
+			Ri->neg=0;
+			Ri->d[0]=BN_MASK2;
+			Ri->d[1]=BN_MASK2;
+			Ri->top=2;
+			}
+		if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
+		/* Ni = (R*Ri-1)/N,
+		 * keep only couple of least significant words: */
+		mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+		mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+#else
 							/* Ri = R^-1 mod N*/
 		if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
 			goto err;
@@ -435,6 +627,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
 		/* Ni = (R*Ri-1)/N,
 		 * keep only least significant word: */
 		mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0;
+#endif
 		}
 #else /* !MONT_WORD */
 		{ /* bignum version */
@@ -470,7 +663,12 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
 	if (!BN_copy(&(to->N),&(from->N))) return NULL;
 	if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL;
 	to->ri=from->ri;
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+	to->n0[0]=from->n0[0];
+	to->n0[1]=from->n0[1];
+#else
 	to->n0=from->n0;
+#endif
 	return(to);
 	}

--- a/crypto/perlasm/x86ms.pl
+++ b/crypto/perlasm/x86ms.pl
@@ -146,6 +146,7 @@ sub main'exch	{ &out2("xchg",@_); }
 sub main'cmp	{ &out2("cmp",@_); }
 sub main'lea	{ &out2("lea",@_); }
 sub main'mul	{ &out1("mul",@_); }
+sub main'imul	{ &out2("imul",@_); }
 sub main'div	{ &out1("div",@_); }
 sub main'dec	{ &out1("dec",@_); }
 sub main'inc	{ &out1("inc",@_); }
--- a/crypto/perlasm/x86nasm.pl
+++ b/crypto/perlasm/x86nasm.pl
@@ -154,6 +154,7 @@ sub main'exch	{ &out2("xchg",@_); }
 sub main'cmp	{ &out2("cmp",@_); }
 sub main'lea	{ &out2("lea",@_); }
 sub main'mul	{ &out1("mul",@_); }
+sub main'imul	{ &out2("imul",@_); }
 sub main'div	{ &out1("div",@_); }
 sub main'dec	{ &out1("dec",@_); }
 sub main'inc	{ &out1("inc",@_); }
--- a/crypto/perlasm/x86unix.pl
+++ b/crypto/perlasm/x86unix.pl
@@ -171,6 +171,7 @@ sub main'exch	{ &out2($_[0]=~/%[a-d][lh]/?"xchgb":"xchgl",@_); }
 sub main'cmp	{ &out2("cmpl",@_); }
 sub main'lea	{ &out2("leal",@_); }
 sub main'mul	{ &out1("mull",@_); }
+sub main'imul	{ &out2("imull",@_); }
 sub main'div	{ &out1("divl",@_); }
 sub main'jmp	{ &out1("jmp",@_); }
 sub main'jmp_ptr { &out1p("jmp",@_); }