crypto/bn/x86_64-mont5.pl: constant-time gather procedure.
At the same time, remove the minuscule bias in the final subtraction. The performance penalty varies from platform to platform, and even with key length. For rsa2048 sign it was observed to be 4% on Sandy Bridge and 7% on Broadwell. CVE-2016-0702. Reviewed-by: Richard Levitte <levitte@openssl.org>. Reviewed-by: Rich Salz <rsalz@openssl.org>. (cherry picked from master)
This commit is contained in:
		
				
					committed by
					
						
						Matt Caswell
					
				
			
			
				
	
			
			
			
						parent
						
							08ea966c01
						
					
				
				
					commit
					25d14c6c29
				
			@@ -775,20 +775,20 @@ bn_sqr8x_mont:
 | 
			
		||||
	# 4096. this is done to allow memory disambiguation logic
 | 
			
		||||
	# do its job.
 | 
			
		||||
	#
 | 
			
		||||
	lea	-64(%rsp,$num,4),%r11
 | 
			
		||||
	lea	-64(%rsp,$num,2),%r11
 | 
			
		||||
	mov	($n0),$n0		# *n0
 | 
			
		||||
	sub	$aptr,%r11
 | 
			
		||||
	and	\$4095,%r11
 | 
			
		||||
	cmp	%r11,%r10
 | 
			
		||||
	jb	.Lsqr8x_sp_alt
 | 
			
		||||
	sub	%r11,%rsp		# align with $aptr
 | 
			
		||||
	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
 | 
			
		||||
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 | 
			
		||||
	jmp	.Lsqr8x_sp_done
 | 
			
		||||
 | 
			
		||||
.align	32
 | 
			
		||||
.Lsqr8x_sp_alt:
 | 
			
		||||
	lea	4096-64(,$num,4),%r10	# 4096-frame-4*$num
 | 
			
		||||
	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
 | 
			
		||||
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
 | 
			
		||||
	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 | 
			
		||||
	sub	%r10,%r11
 | 
			
		||||
	mov	\$0,%r10
 | 
			
		||||
	cmovc	%r10,%r11
 | 
			
		||||
@@ -798,37 +798,17 @@ bn_sqr8x_mont:
 | 
			
		||||
	mov	$num,%r10	
 | 
			
		||||
	neg	$num
 | 
			
		||||
 | 
			
		||||
	lea	64(%rsp,$num,2),%r11	# copy of modulus
 | 
			
		||||
	mov	$n0,  32(%rsp)
 | 
			
		||||
	mov	%rax, 40(%rsp)		# save original %rsp
 | 
			
		||||
.Lsqr8x_body:
 | 
			
		||||
 | 
			
		||||
	mov	$num,$i
 | 
			
		||||
	movq	%r11, %xmm2		# save pointer to modulus copy
 | 
			
		||||
	shr	\$3+2,$i
 | 
			
		||||
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
 | 
			
		||||
	jmp	.Lsqr8x_copy_n
 | 
			
		||||
 | 
			
		||||
.align	32
 | 
			
		||||
.Lsqr8x_copy_n:
 | 
			
		||||
	movq	8*0($nptr),%xmm0
 | 
			
		||||
	movq	8*1($nptr),%xmm1
 | 
			
		||||
	movq	8*2($nptr),%xmm3
 | 
			
		||||
	movq	8*3($nptr),%xmm4
 | 
			
		||||
	lea	8*4($nptr),$nptr
 | 
			
		||||
	movdqa	%xmm0,16*0(%r11)
 | 
			
		||||
	movdqa	%xmm1,16*1(%r11)
 | 
			
		||||
	movdqa	%xmm3,16*2(%r11)
 | 
			
		||||
	movdqa	%xmm4,16*3(%r11)
 | 
			
		||||
	lea	16*4(%r11),%r11
 | 
			
		||||
	dec	$i
 | 
			
		||||
	jnz	.Lsqr8x_copy_n
 | 
			
		||||
 | 
			
		||||
	movq	$nptr, %xmm2		# save pointer to modulus
 | 
			
		||||
	pxor	%xmm0,%xmm0
 | 
			
		||||
	movq	$rptr,%xmm1		# save $rptr
 | 
			
		||||
	movq	%r10, %xmm3		# -$num
 | 
			
		||||
___
 | 
			
		||||
$code.=<<___ if ($addx);
 | 
			
		||||
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
 | 
			
		||||
	and	\$0x80100,%eax
 | 
			
		||||
	cmp	\$0x80100,%eax
 | 
			
		||||
	jne	.Lsqr8x_nox
 | 
			
		||||
@@ -837,7 +817,6 @@ $code.=<<___ if ($addx);
 | 
			
		||||
 | 
			
		||||
	pxor	%xmm0,%xmm0
 | 
			
		||||
	lea	48(%rsp),%rax
 | 
			
		||||
	lea	64(%rsp,$num,2),%rdx
 | 
			
		||||
	shr	\$3+2,$num
 | 
			
		||||
	mov	40(%rsp),%rsi		# restore %rsp
 | 
			
		||||
	jmp	.Lsqr8x_zero
 | 
			
		||||
@@ -850,7 +829,6 @@ $code.=<<___;
 | 
			
		||||
 | 
			
		||||
	pxor	%xmm0,%xmm0
 | 
			
		||||
	lea	48(%rsp),%rax
 | 
			
		||||
	lea	64(%rsp,$num,2),%rdx
 | 
			
		||||
	shr	\$3+2,$num
 | 
			
		||||
	mov	40(%rsp),%rsi		# restore %rsp
 | 
			
		||||
	jmp	.Lsqr8x_zero
 | 
			
		||||
@@ -862,11 +840,6 @@ $code.=<<___;
 | 
			
		||||
	movdqa	%xmm0,16*2(%rax)
 | 
			
		||||
	movdqa	%xmm0,16*3(%rax)
 | 
			
		||||
	lea	16*4(%rax),%rax
 | 
			
		||||
	movdqa	%xmm0,16*0(%rdx)	# wipe n
 | 
			
		||||
	movdqa	%xmm0,16*1(%rdx)
 | 
			
		||||
	movdqa	%xmm0,16*2(%rdx)
 | 
			
		||||
	movdqa	%xmm0,16*3(%rdx)
 | 
			
		||||
	lea	16*4(%rdx),%rdx
 | 
			
		||||
	dec	$num
 | 
			
		||||
	jnz	.Lsqr8x_zero
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -788,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 | 
			
		||||
    if (window >= 5) {
 | 
			
		||||
        window = 5;             /* ~5% improvement for RSA2048 sign, and even
 | 
			
		||||
                                 * for RSA4096 */
 | 
			
		||||
        if ((top & 7) == 0)
 | 
			
		||||
            powerbufLen += 2 * top * sizeof(m->d[0]);
 | 
			
		||||
        /* reserve space for mont->N.d[] copy */
 | 
			
		||||
        powerbufLen += top * sizeof(mont->N.d[0]);
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
    (void)0;
 | 
			
		||||
@@ -1010,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 | 
			
		||||
                               const BN_ULONG *not_used, const BN_ULONG *np,
 | 
			
		||||
                               const BN_ULONG *n0, int num);
 | 
			
		||||
 | 
			
		||||
        BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
 | 
			
		||||
        BN_ULONG *n0 = mont->n0, *np;
 | 
			
		||||
 | 
			
		||||
        /*
 | 
			
		||||
         * BN_to_montgomery can contaminate words above .top [in
 | 
			
		||||
@@ -1021,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 | 
			
		||||
        for (i = tmp.top; i < top; i++)
 | 
			
		||||
            tmp.d[i] = 0;
 | 
			
		||||
 | 
			
		||||
        if (top & 7)
 | 
			
		||||
            np2 = np;
 | 
			
		||||
        else
 | 
			
		||||
            for (np2 = am.d + top, i = 0; i < top; i++)
 | 
			
		||||
                np2[2 * i] = np[i];
 | 
			
		||||
        /*
 | 
			
		||||
         * copy mont->N.d[] to improve cache locality
 | 
			
		||||
         */
 | 
			
		||||
        for (np = am.d + top, i = 0; i < top; i++)
 | 
			
		||||
            np[i] = mont->N.d[i];
 | 
			
		||||
 | 
			
		||||
        bn_scatter5(tmp.d, top, powerbuf, 0);
 | 
			
		||||
        bn_scatter5(am.d, am.top, powerbuf, 1);
 | 
			
		||||
@@ -1035,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 | 
			
		||||
# if 0
 | 
			
		||||
        for (i = 3; i < 32; i++) {
 | 
			
		||||
            /* Calculate a^i = a^(i-1) * a */
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
 | 
			
		||||
            bn_scatter5(tmp.d, top, powerbuf, i);
 | 
			
		||||
        }
 | 
			
		||||
# else
 | 
			
		||||
@@ -1046,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 | 
			
		||||
        }
 | 
			
		||||
        for (i = 3; i < 8; i += 2) {
 | 
			
		||||
            int j;
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
 | 
			
		||||
            bn_scatter5(tmp.d, top, powerbuf, i);
 | 
			
		||||
            for (j = 2 * i; j < 32; j *= 2) {
 | 
			
		||||
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
 | 
			
		||||
@@ -1054,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        for (; i < 16; i += 2) {
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
 | 
			
		||||
            bn_scatter5(tmp.d, top, powerbuf, i);
 | 
			
		||||
            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
 | 
			
		||||
            bn_scatter5(tmp.d, top, powerbuf, 2 * i);
 | 
			
		||||
        }
 | 
			
		||||
        for (; i < 32; i += 2) {
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
 | 
			
		||||
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
 | 
			
		||||
            bn_scatter5(tmp.d, top, powerbuf, i);
 | 
			
		||||
        }
 | 
			
		||||
# endif
 | 
			
		||||
@@ -1089,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 | 
			
		||||
            while (bits >= 0) {
 | 
			
		||||
                wvalue = bn_get_bits5(p->d, bits - 4);
 | 
			
		||||
                bits -= 5;
 | 
			
		||||
                bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
 | 
			
		||||
                bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
 | 
			
		||||
        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
 | 
			
		||||
        tmp.top = top;
 | 
			
		||||
        bn_correct_top(&tmp);
 | 
			
		||||
        if (ret) {
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user