bn/asm/x86_64-mont5.pl: add MULX/AD*X code path.

This also eliminates code duplication between x86_64-mont and x86_64-mont5
and optimizes even the original non-MULX code.
This commit is contained in:
Andy Polyakov 2013-12-09 21:02:24 +01:00
parent d1671f4f1a
commit ec9cc70f72
3 changed files with 2651 additions and 2181 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -726,7 +726,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
else
#endif
#if defined(OPENSSL_BN_ASM_MONT5)
if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
if (window>=5)
{
window=5; /* ~5% improvement for RSA2048 sign, and even for RSA4096 */
if ((top&7)==0) powerbufLen += 2*top*sizeof(m->d[0]);
}
#endif
(void)0;
@ -734,7 +738,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
* powers of am, am itself and tmp.
*/
numPowers = 1 << window;
powerbufLen = sizeof(m->d[0])*(top*numPowers +
powerbufLen += sizeof(m->d[0])*(top*numPowers +
((2*top)>numPowers?(2*top):numPowers));
#ifdef alloca
if (powerbufLen < 3072)
@ -912,14 +916,26 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
void *table,size_t power);
void bn_gather5(BN_ULONG *out,size_t num,
void *table,size_t power);
void bn_power5(BN_ULONG *rp,const BN_ULONG *ap,
const void *table,const BN_ULONG *np,
const BN_ULONG *n0,int num,int power);
int bn_get_bits5(const BN_ULONG *ap,int off);
int bn_from_montgomery(BN_ULONG *rp,const BN_ULONG *ap,
const BN_ULONG *not_used,const BN_ULONG *np,
const BN_ULONG *n0,int num);
BN_ULONG *np=mont->N.d, *n0=mont->n0;
BN_ULONG *np=mont->N.d, *n0=mont->n0, *np2;
/* BN_to_montgomery can contaminate words above .top
* [in BN_DEBUG[_DEBUG] build]... */
for (i=am.top; i<top; i++) am.d[i]=0;
for (i=tmp.top; i<top; i++) tmp.d[i]=0;
if (top&7)
np2 = np;
else
for (np2=am.d+top,i=0; i<top; i++) np2[2*i]=np[i];
bn_scatter5(tmp.d,top,powerbuf,0);
bn_scatter5(am.d,am.top,powerbuf,1);
bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
@ -929,7 +945,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i=3; i<32; i++)
{
/* Calculate a^i = a^(i-1) * a */
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np2,n0,top,i-1);
bn_scatter5(tmp.d,top,powerbuf,i);
}
#else
@ -942,7 +958,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i=3; i<8; i+=2)
{
int j;
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np2,n0,top,i-1);
bn_scatter5(tmp.d,top,powerbuf,i);
for (j=2*i; j<32; j*=2)
{
@ -952,14 +968,14 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (; i<16; i+=2)
{
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np2,n0,top,i-1);
bn_scatter5(tmp.d,top,powerbuf,i);
bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
bn_scatter5(tmp.d,top,powerbuf,2*i);
}
for (; i<32; i+=2)
{
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np2,n0,top,i-1);
bn_scatter5(tmp.d,top,powerbuf,i);
}
#endif
@ -971,7 +987,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/* Scan the exponent one window at a time starting from the most
* significant bits.
*/
while (bits >= 0)
if (top&7)
while (bits >= 0)
{
for (wvalue=0, i=0; i<5; i++,bits--)
wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
@ -983,9 +1000,24 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
}
else
{
while (bits >= 0)
{
wvalue = bn_get_bits5(p->d,bits-4);
bits-=5;
bn_power5(tmp.d,tmp.d,powerbuf,np2,n0,top,wvalue);
}
}
ret=bn_from_montgomery(tmp.d,tmp.d,NULL,np2,n0,top);
tmp.top=top;
bn_correct_top(&tmp);
if (ret)
{
if (!BN_copy(rr,&tmp)) ret=0;
goto err; /* non-zero ret means it's not error */
}
}
else
#endif