Add reference implementation for bn_[mul|sqr]_mont, new candidates for assembler implementation.
2005-10-04 06:19:29 +00:00 · 2005-10-04 06:19:29 +00:00 · e738280547
commit e738280547
parent 8265328def
3 changed files with 142 additions and 2 deletions
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@ -727,6 +727,8 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
 	bn_pollute(a); \
 	}

+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num);
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num);
 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	r[6]=c1;
 	r[7]=c2;
 	}
+
+#ifdef OPENSSL_BN_ASM_MONT
+/*
+ * This is essentially a reference implementation, which may or may not
+ * result in performance improvement. E.g. on IA-32 this does give 40%
+ * faster rsa1024 private key operations and 10% faster rsa4096 ones,
+ * while on AMD64 it improves rsa1024 sign only by 10%, but *worsens*
+ * rsa4096 sign by 15%. Once again, it's a reference implementation,
+ * one to be used as a starting point for platform-specific assembler.
+ *
+ * bn_mul_mont writes num words to rp, computed from the num-word
+ * vectors ap, bp and np and the single word n0. Pass i of the outer
+ * loop adds ap*bp[i] to the scratch vector tp (pass 0 uses
+ * bn_mul_words and jumps straight to the second half), then adds ml*np
+ * with ml = tp[0]*n0 and stores the result one word lower, dropping
+ * the low word. After num passes rp receives tp-np when tp[num] is
+ * non-zero or bn_sub_words returns 0, and tp otherwise.
+ *
+ * NOTE(review): the low word of tp + ml*np is discarded unchecked, so
+ * n0 is presumably the usual Montgomery constant -1/np[0] modulo the
+ * word size -- confirm against the code that computes n0.
+ * NOTE(review): mul_add, bn_mul_words, bn_mul_add_words and
+ * bn_sub_words are defined elsewhere; the carry/borrow meanings assumed
+ * in the comments below should be confirmed there.
+ *
+ * The scratch vector is num+2 words obtained with alloca() and is
+ * zeroed through a volatile pointer before every return. Neither num
+ * nor the alloca() result is checked here.
+ */
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	BN_ULONG c0,c1,ml,*tp;
+#ifdef mul64
+	BN_ULONG mh;
+#endif
+	volatile BN_ULONG *vp;
+	int i=0,j;	/* i must start at 0 here: the goto below enters the loop without running its init */
+
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));	/* num+2 words of stack scratch; vp aliases it for wiping */
+
+	tp[num]   = bn_mul_words(tp,ap,num,bp[0]);	/* pass 0: tp = ap*bp[0], return value kept as the top word */
+	tp[num+1] = 0;
+	goto enter;	/* skip the accumulate step of the first pass */
+
+	for(i=0;i<num;i++)
+		{
+		c0 = bn_mul_add_words(tp,ap,num,bp[i]);	/* tp += ap*bp[i]; c0 is presumably the carry word */
+		c1 = (tp[num] + c0)&BN_MASK2;
+		tp[num]   = c1;
+		tp[num+1] = (c1<c0?1:0);	/* sum wrapped around => carry into the extra word */
+	enter:
+		c1  = tp[0];
+		ml = (c1*n0)&BN_MASK2;	/* per-pass multiplier for np, derived from the low word of tp */
+		c0 = 0;
+#ifdef mul64
+		mh = HBITS(ml);
+		ml = LBITS(ml);
+		mul_add(c1,np[0],ml,mh,c0);	/* resulting c1 is never stored: the low word is dropped */
+#else
+		mul_add(c1,ml,np[0],c0);	/* resulting c1 is never stored: the low word is dropped */
+#endif
+		for(j=1;j<num;j++)
+			{
+			c1 = tp[j];
+#ifdef mul64
+			mul_add(c1,np[j],ml,mh,c0);
+#else
+			mul_add(c1,ml,np[j],c0);
+#endif
+			tp[j-1] = c1&BN_MASK2;	/* stored one index lower: shifts tp down by one word */
+			}
+		c1        = (tp[num] + c0)&BN_MASK2;
+		tp[num-1] = c1;
+		tp[num]   = tp[num+1] + (c1<c0?1:0);	/* fold the saved carry and the new one into the top word */
+		}
+
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])	/* cheap pre-check on the top words before trying tp-np */
+		{
+		c0 = bn_sub_words(rp,tp,np,num);	/* rp = tp-np; c0 is presumably the borrow */
+		if (tp[num]!=0 || c0==0)	/* the difference in rp stands: wipe scratch and finish */
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return;
+			}
+		}
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;	/* otherwise rp = tp, wiping scratch as it is copied */
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	}
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	bn_mul_mont(rp,ap,ap,np,n0,num);	/* no dedicated squaring code: ap is passed as both operands */
+	}
+#endif /* OPENSSL_BN_ASM_MONT */
+
 #else /* !BN_MUL_COMBA */

 /* hmm... is it faster just to do a multiply? */
 #undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[8];	/* 8 words handed to bn_sqr_normal as its last argument -- presumably scratch space, confirm there */
 	bn_sqr_normal(r,a,4,t);	/* !BN_MUL_COMBA build: defer to bn_sqr_normal with a length of 4 */
 	}

 #undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[16];
 	bn_sqr_normal(r,a,8,t);
@ -857,4 +934,49 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
 	}

+#ifdef OPENSSL_BN_ASM_MONT
+/*
+ * Reference bn_mul_mont for builds without BN_MUL_COMBA, written purely
+ * in terms of bn_mul_add_words.
+ *
+ * Writes num words to rp, computed from the num-word vectors ap, bp and
+ * np and the single word n0. Pass i of the outer loop adds ap*bp[i] to
+ * the scratch vector tp, then adds np*(tp[0]*n0), and finally moves
+ * every word of tp one index lower, dropping tp[0]. After num passes rp
+ * receives tp-np when tp[num] is non-zero or bn_sub_words returns 0,
+ * and tp otherwise.
+ *
+ * Every word sum and the multiplier are reduced with BN_MASK2, matching
+ * the BN_MUL_COMBA variant of this function, so that the c1<c0 carry
+ * test stays valid when BN_ULONG is wider than BN_MASK2.
+ *
+ * NOTE(review): tp[0] is dropped unchecked after the second
+ * accumulation, so n0 is presumably the usual Montgomery constant
+ * -1/np[0] modulo the word size -- confirm against the code that
+ * computes n0.
+ * NOTE(review): bn_mul_add_words is assumed to return the carry word
+ * and bn_sub_words the borrow; both are defined elsewhere.
+ *
+ * The scratch vector is num+2 words obtained with alloca() and is
+ * zeroed through a volatile pointer before every return. Neither num
+ * nor the alloca() result is checked here.
+ */
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	BN_ULONG c0,c1,*tp;
+	volatile BN_ULONG *vp;
+	int i=0,j;
+
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+	/* tp[num+1] needs no clearing: it is assigned below before it is read */
+	for(i=0;i<=num;i++)	tp[i]=0;
+
+	for(i=0;i<num;i++)
+		{
+		/* tp += ap*bp[i], carry propagated into tp[num] and tp[num+1] */
+		c0         = bn_mul_add_words(tp,ap,num,bp[i]);
+		c1         = (tp[num] + c0)&BN_MASK2;
+		tp[num]    = c1;
+		tp[num+1]  = (c1<c0?1:0);
+
+		/* tp += np*(tp[0]*n0), carry accumulated on top of the previous one */
+		c0         = bn_mul_add_words(tp,np,num,(tp[0]*n0)&BN_MASK2);
+		c1         = (tp[num] + c0)&BN_MASK2;
+		tp[num]    = c1;
+		tp[num+1] += (c1<c0?1:0);
+		/* shift down by one word; the last step reads tp[num+1] */
+		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
+		}
+
+	/* cheap pre-check on the top words before trying tp-np */
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])
+		{
+		c0 = bn_sub_words(rp,tp,np,num);
+		/* the difference in rp stands: wipe scratch and finish */
+		if (tp[num]!=0 || c0==0)
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return;
+			}
+		}
+	/* otherwise rp = tp, wiping scratch as it is copied */
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	}
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+	{
+	bn_mul_mont(rp,ap,ap,np,n0,num);	/* no dedicated squaring code: ap is passed as both operands */
+	}
+#endif /* OPENSSL_BN_ASM_MONT */
+
 #endif /* !BN_MUL_COMBA */
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@ -74,6 +74,22 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 	{
 	BIGNUM *tmp;
 	int ret=0;
+#ifdef OPENSSL_BN_ASM_MONT
+	int num = mont->N.top;
+
+	if (num>1 && a->top==num && b->top==num)
+		{
+		if (bn_wexpand(r,num) == NULL) return 0;
+		r->neg = a->neg^b->neg;
+		r->top = num;
+		if (a==b)
+			bn_sqr_mont(r->d,a->d,mont->N.d,mont->n0,num);
+		else
+			bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num);
+		bn_fix_top(r);
+		return 1;
+		}
+#endif

 	BN_CTX_start(ctx);
 	tmp = BN_CTX_get(ctx);