crypto/bn/x86_64-mont5.pl: constant-time gather procedure.

At the same time remove minuscule bias in final subtraction.
Performance penalty varies from platform to platform, and also with
key length. For rsa2048 sign it was observed to be 4% on Sandy
Bridge and 7% on Broadwell.

CVE-2016-0702

Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from master)
This commit is contained in:
Andy Polyakov 2016-01-25 23:41:01 +01:00 committed by Matt Caswell
parent 08ea966c01
commit 25d14c6c29
3 changed files with 676 additions and 556 deletions

View File

@ -775,20 +775,20 @@ bn_sqr8x_mont:
# 4096. this is done to allow memory disambiguation logic # 4096. this is done to allow memory disambiguation logic
# do its job. # do its job.
# #
lea -64(%rsp,$num,4),%r11 lea -64(%rsp,$num,2),%r11
mov ($n0),$n0 # *n0 mov ($n0),$n0 # *n0
sub $aptr,%r11 sub $aptr,%r11
and \$4095,%r11 and \$4095,%r11
cmp %r11,%r10 cmp %r11,%r10
jb .Lsqr8x_sp_alt jb .Lsqr8x_sp_alt
sub %r11,%rsp # align with $aptr sub %r11,%rsp # align with $aptr
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
jmp .Lsqr8x_sp_done jmp .Lsqr8x_sp_done
.align 32 .align 32
.Lsqr8x_sp_alt: .Lsqr8x_sp_alt:
lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
sub %r10,%r11 sub %r10,%r11
mov \$0,%r10 mov \$0,%r10
cmovc %r10,%r11 cmovc %r10,%r11
@ -798,37 +798,17 @@ bn_sqr8x_mont:
mov $num,%r10 mov $num,%r10
neg $num neg $num
lea 64(%rsp,$num,2),%r11 # copy of modulus
mov $n0, 32(%rsp) mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp mov %rax, 40(%rsp) # save original %rsp
.Lsqr8x_body: .Lsqr8x_body:
mov $num,$i movq $nptr, %xmm2 # save pointer to modulus
movq %r11, %xmm2 # save pointer to modulus copy
shr \$3+2,$i
mov OPENSSL_ia32cap_P+8(%rip),%eax
jmp .Lsqr8x_copy_n
.align 32
.Lsqr8x_copy_n:
movq 8*0($nptr),%xmm0
movq 8*1($nptr),%xmm1
movq 8*2($nptr),%xmm3
movq 8*3($nptr),%xmm4
lea 8*4($nptr),$nptr
movdqa %xmm0,16*0(%r11)
movdqa %xmm1,16*1(%r11)
movdqa %xmm3,16*2(%r11)
movdqa %xmm4,16*3(%r11)
lea 16*4(%r11),%r11
dec $i
jnz .Lsqr8x_copy_n
pxor %xmm0,%xmm0 pxor %xmm0,%xmm0
movq $rptr,%xmm1 # save $rptr movq $rptr,%xmm1 # save $rptr
movq %r10, %xmm3 # -$num movq %r10, %xmm3 # -$num
___ ___
$code.=<<___ if ($addx); $code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%eax
and \$0x80100,%eax and \$0x80100,%eax
cmp \$0x80100,%eax cmp \$0x80100,%eax
jne .Lsqr8x_nox jne .Lsqr8x_nox
@ -837,7 +817,6 @@ $code.=<<___ if ($addx);
pxor %xmm0,%xmm0 pxor %xmm0,%xmm0
lea 48(%rsp),%rax lea 48(%rsp),%rax
lea 64(%rsp,$num,2),%rdx
shr \$3+2,$num shr \$3+2,$num
mov 40(%rsp),%rsi # restore %rsp mov 40(%rsp),%rsi # restore %rsp
jmp .Lsqr8x_zero jmp .Lsqr8x_zero
@ -850,7 +829,6 @@ $code.=<<___;
pxor %xmm0,%xmm0 pxor %xmm0,%xmm0
lea 48(%rsp),%rax lea 48(%rsp),%rax
lea 64(%rsp,$num,2),%rdx
shr \$3+2,$num shr \$3+2,$num
mov 40(%rsp),%rsi # restore %rsp mov 40(%rsp),%rsi # restore %rsp
jmp .Lsqr8x_zero jmp .Lsqr8x_zero
@ -862,11 +840,6 @@ $code.=<<___;
movdqa %xmm0,16*2(%rax) movdqa %xmm0,16*2(%rax)
movdqa %xmm0,16*3(%rax) movdqa %xmm0,16*3(%rax)
lea 16*4(%rax),%rax lea 16*4(%rax),%rax
movdqa %xmm0,16*0(%rdx) # wipe n
movdqa %xmm0,16*1(%rdx)
movdqa %xmm0,16*2(%rdx)
movdqa %xmm0,16*3(%rdx)
lea 16*4(%rdx),%rdx
dec $num dec $num
jnz .Lsqr8x_zero jnz .Lsqr8x_zero

File diff suppressed because it is too large Load Diff

View File

@ -788,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window >= 5) { if (window >= 5) {
window = 5; /* ~5% improvement for RSA2048 sign, and even window = 5; /* ~5% improvement for RSA2048 sign, and even
* for RSA4096 */ * for RSA4096 */
if ((top & 7) == 0) /* reserve space for mont->N.d[] copy */
powerbufLen += 2 * top * sizeof(m->d[0]); powerbufLen += top * sizeof(mont->N.d[0]);
} }
#endif #endif
(void)0; (void)0;
@ -1010,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BN_ULONG *not_used, const BN_ULONG *np, const BN_ULONG *not_used, const BN_ULONG *np,
const BN_ULONG *n0, int num); const BN_ULONG *n0, int num);
BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2; BN_ULONG *n0 = mont->n0, *np;
/* /*
* BN_to_montgomery can contaminate words above .top [in * BN_to_montgomery can contaminate words above .top [in
@ -1021,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++) for (i = tmp.top; i < top; i++)
tmp.d[i] = 0; tmp.d[i] = 0;
if (top & 7) /*
np2 = np; * copy mont->N.d[] to improve cache locality
else */
for (np2 = am.d + top, i = 0; i < top; i++) for (np = am.d + top, i = 0; i < top; i++)
np2[2 * i] = np[i]; np[i] = mont->N.d[i];
bn_scatter5(tmp.d, top, powerbuf, 0); bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1); bn_scatter5(am.d, am.top, powerbuf, 1);
@ -1035,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0 # if 0
for (i = 3; i < 32; i++) { for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */ /* Calculate a^i = a^(i-1) * a */
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i); bn_scatter5(tmp.d, top, powerbuf, i);
} }
# else # else
@ -1046,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
} }
for (i = 3; i < 8; i += 2) { for (i = 3; i < 8; i += 2) {
int j; int j;
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i); bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) { for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@ -1054,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
} }
} }
for (; i < 16; i += 2) { for (; i < 16; i += 2) {
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i); bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i); bn_scatter5(tmp.d, top, powerbuf, 2 * i);
} }
for (; i < 32; i += 2) { for (; i < 32; i += 2) {
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i); bn_scatter5(tmp.d, top, powerbuf, i);
} }
# endif # endif
@ -1089,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
while (bits >= 0) { while (bits >= 0) {
wvalue = bn_get_bits5(p->d, bits - 4); wvalue = bn_get_bits5(p->d, bits - 4);
bits -= 5; bits -= 5;
bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue); bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
} }
} }
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top); ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
tmp.top = top; tmp.top = top;
bn_correct_top(&tmp); bn_correct_top(&tmp);
if (ret) { if (ret) {