diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 2688684b6..c296e10d2 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -727,6 +727,8 @@ int RAND_pseudo_bytes(unsigned char *buf,int num); bn_pollute(a); \ } +void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num); +void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num); BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w); BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w); void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num); diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c index 99bc2de49..52af96d36 100644 --- a/crypto/bn/bn_asm.c +++ b/crypto/bn/bn_asm.c @@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) r[6]=c1; r[7]=c2; } + +#ifdef OPENSSL_BN_ASM_MONT +/* + * This is essentially a reference implementation, which may or may not + * result in a performance improvement. E.g. on IA-32 this does give 40% + * faster rsa1024 private key operations and 10% faster rsa4096 ones, + * while on AMD64 it improves rsa1024 sign only by 10%, but *worsens* + * rsa4096 sign by 15%. Once again, it's a reference implementation, + * one to be used as a starting point for platform-specific assembler. 
+ */ +void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num) + { + BN_ULONG c0,c1,ml,*tp; +#ifdef mul64 + BN_ULONG mh; +#endif + volatile BN_ULONG *vp; /* volatile alias of tp; both are assigned from the same alloca() call below */ + int i=0,j; + + vp = tp = alloca((num+2)*sizeof(BN_ULONG)); /* scratch area of num+2 words; the alloca() result is used without a check */ + + tp[num] = bn_mul_words(tp,ap,num,bp[0]); /* first pass uses bp[0]; the word returned by bn_mul_words is stored in tp[num] */ + tp[num+1] = 0; + goto enter; /* NOTE(review): the 'enter' label is not visible in this copy -- the text from the loop header below up to the final comparison appears to have been lost in extraction (everything between a '<' and a later '>'); restore from upstream before relying on this hunk */ + + for(i=0;i=np[num-1]) + { + c0 = bn_sub_words(rp,tp,np,num); /* presumably rp = tp - np with c0 as the borrow -- TODO confirm against bn_sub_words */ + if (tp[num]!=0 || c0==0) + { + for(i=0;i=np[num-1]) + { + c0 = bn_sub_words(rp,tp,np,num); /* same call as in the fragment above; NOTE(review): this looks like the tail of bn_sqr_mont, whose beginning is missing here -- confirm */ + if (tp[num]!=0 || c0==0) + { + for(i=0;iN.top; /* NOTE(review): truncated again -- 'num' is read below, but the statement assigning it is not intact in this copy */ + + if (num>1 && a->top==num && b->top==num) /* taken only when num>1 and a and b both have top equal to num */ + { + if (bn_wexpand(r,num) == NULL) return 0; /* returns 0 when bn_wexpand yields NULL */ + r->neg = a->neg^b->neg; /* result sign flag is the XOR of the operands' flags */ + r->top = num; + if (a==b) /* identical operand pointers: call the squaring routine */ + bn_sqr_mont(r->d,a->d,mont->N.d,mont->n0,num); + else + bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num); + bn_fix_top(r); + return 1; /* returns here without reaching BN_CTX_start() below */ + } +#endif BN_CTX_start(ctx); tmp = BN_CTX_get(ctx);