Optimize SPARC T4 MONTMUL support.

Improve RSA sign performance by 20-30% by:
- switching from floating-point to integer conditional moves;
- daisy-chaining sqr-sqr-sqr-sqr-sqr-mul sequences (see the sketch below);
- using MONTMUL even during powers table setup.
(cherry picked from commit 4ddacd9921f48013b5cd011e4b93b294c14db1c2)
Andy Polyakov 2013-06-18 10:39:38 +02:00
parent e887c4187e
commit 03c8441b14
2 changed files with 253 additions and 155 deletions
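
The "daisy-chaining sqr-sqr-sqr-sqr-sqr-mul" item refers to the fixed 5-bit-window pattern that the reworked bn_pwr5_mont_t4_* paths below execute: five Montgomery squarings followed by one multiplication with a gathered table entry per window. A minimal, self-contained C sketch of that pattern on a single 64-bit word (plain modular arithmetic instead of Montgomery form; pow_window5 and mulmod are illustrative names, not OpenSSL functions):

#include <stdint.h>

/* Fixed 5-bit-window modular exponentiation on one 64-bit word, only to
 * illustrate the sqr-sqr-sqr-sqr-sqr-mul pattern named in the commit
 * message; the real code works on multi-word Montgomery residues and
 * gathers table entries in constant time.  Requires m > 1. */
static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
{
    return (uint64_t)(((__uint128_t)a * b) % m);   /* assumes __uint128_t */
}

uint64_t pow_window5(uint64_t a, uint64_t e, uint64_t m)
{
    uint64_t tbl[32], r = 1 % m;
    int i, bits;

    tbl[0] = 1 % m;                               /* a^0 .. a^31 mod m */
    for (i = 1; i < 32; i++)
        tbl[i] = mulmod(tbl[i - 1], a, m);

    for (bits = 60; bits >= 0; bits -= 5) {       /* 13 windows of 5 bits */
        for (i = 0; i < 5; i++)
            r = mulmod(r, r, m);                  /* five squarings ...   */
        r = mulmod(r, tbl[(e >> bits) & 31], m);  /* ... then one multiply */
    }
    return r;
}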

crypto/bn/asm/sparct4-mont.pl

@@ -29,36 +29,36 @@
 #
 # 64-bit process, VIS3:
 # sign verify sign/s verify/s
-# rsa 1024 bits 0.000633s 0.000033s 1578.9 30513.3
-# rsa 2048 bits 0.003297s 0.000116s 303.3 8585.8
-# rsa 4096 bits 0.026000s 0.000387s 38.5 2587.0
+# rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
+# rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
+# rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
 # dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
 # dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
 #
 # 64-bit process, this module:
 # sign verify sign/s verify/s
-# rsa 1024 bits 0.000341s 0.000021s 2931.5 46873.8
-# rsa 2048 bits 0.001244s 0.000044s 803.9 22569.1
-# rsa 4096 bits 0.006203s 0.000387s 161.2 2586.3
-# dsa 1024 bits 0.000179s 0.000195s 5573.9 5115.6
-# dsa 2048 bits 0.000311s 0.000350s 3212.3 2856.6
+# rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
+# rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
+# rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
+# dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
+# dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
 #
 ######################################################################
 # 32-bit process, VIS3:
 # sign verify sign/s verify/s
-# rsa 1024 bits 0.000675s 0.000033s 1480.9 30159.0
-# rsa 2048 bits 0.003383s 0.000118s 295.6 8499.9
-# rsa 4096 bits 0.026178s 0.000394s 38.2 2541.3
-# dsa 1024 bits 0.000326s 0.000343s 3070.0 2918.8
-# dsa 2048 bits 0.001121s 0.001291s 891.9 774.4
+# rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
+# rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
+# rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
+# dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
+# dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
 #
 # 32-bit process, this module:
 # sign verify sign/s verify/s
-# rsa 1024 bits 0.000386s 0.000022s 2589.6 45704.9
-# rsa 2048 bits 0.001335s 0.000046s 749.3 21766.8
-# rsa 4096 bits 0.006390s 0.000393s 156.5 2544.8
-# dsa 1024 bits 0.000208s 0.000204s 4817.6 4896.6
-# dsa 2048 bits 0.000345s 0.000364s 2898.8 2747.3
+# rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
+# rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
+# rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
+# dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
+# dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
 #
 # 32-bit code is prone to performance degradation as interrupt rate
 # dispatched to CPU executing the code grows. This is because in
@@ -69,7 +69,7 @@
 # hardly measurable. But in order to mitigate this problem for higher
 # interrupt rates contemporary Linux kernel recognizes biased stack
 # even in 32-bit process context and preserves full register contents.
-# See http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;h=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
+# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
 # for details.

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -98,8 +98,8 @@ ___
 #
 my @R=map("%f".2*$_,(0..11,30,31,12..29));
 my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
-my @B=(map("%o$_",(0..5)),@N[0..13],@N[0..11]);
 my @A=(@N[0..13],@R[14..31]);
+my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));

 ########################################################################
 # int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
@@ -223,20 +223,11 @@ $code.=<<___;
 ___

 # load bp[$NUM] ########################################################
-for($i=0; $i<6 && $i<$NUM; $i++) {
-my $lo=$i<5?@B[$i+1]:"%o7";
-$code.=<<___;
-ld [$bp+$i*8+0],$lo
-ld [$bp+$i*8+4],@B[$i]
-sllx @B[$i],32,@B[$i]
-or $lo,@B[$i],@B[$i]
-___
-}
 $code.=<<___;
 save %sp,-128,%sp; or $sentinel,%fp,%fp
 ___
-for(; $i<20 && $i<$NUM; $i++) {
-my $lo=$i<19?@B[$i+1]:"%o7";
+for($i=0; $i<14 && $i<$NUM; $i++) {
+my $lo=$i<13?@B[$i+1]:"%o7";
 $code.=<<___;
 ld [$bp+$i*8+0],$lo
 ld [$bp+$i*8+4],@B[$i]
@@ -355,58 +346,84 @@
 ########################################################################
 #
-sub load_fcc() {
-my ($ptbl,$pwr,$tmp)=@_;
+sub load_ccr {
+my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
 $code.=<<___;
-sethi %hi(.Lmagic-1f),$tmp
-1: call .+8
-add %o7, $tmp, %o7
-inc %lo(.Lmagic-1b),%o7
-and $pwr, 7<<2, $tmp ! offset within "magic table"
-add $tmp, %o7, %o7
-and $pwr, 3, $tmp
-sll $tmp, 3, $tmp ! offset within first cache line
-add $tmp, $ptbl, $ptbl ! of the pwrtbl
-! "magic table" is organized in such way that below comparisons
-! make %fcc3:%fcc2:%fcc1:%fcc0 form a byte of 1s with one 0,
-! e.g. 0b11011111, with 0 denoting relevant cache line.
-ld [%o7+0], %f0 ! load column
-ld [%o7+32], %f1
-ld [%o7+64], %f2
-fcmps %fcc0, %f0, %f1
-ld [%o7+96], %f3
-fcmps %fcc1, %f1, %f2
-fcmps %fcc2, %f2, %f3
-fcmps %fcc3, %f3, %f0
+srl $pwr, 2, %o4
+and $pwr, 3, %o5
+and %o4, 7, %o4
+sll %o5, 3, %o5 ! offset within first cache line
+add %o5, $ptbl, $ptbl ! of the pwrtbl
+or %g0, 1, %o5
+sll %o5, %o4, $ccr
+___
+$code.=<<___ if (!$skip_wr);
+wr $ccr, %g0, %ccr
 ___
 }
-sub load_f16() {
-my $ptbl=shift;
+sub load_b_pair {
+my ($pwrtbl,$B0,$B1)=@_;
 $code.=<<___;
-ldd [$ptbl+0*32],%f0 ! load all cache lines
-ldd [$ptbl+1*32],%f2
-ldd [$ptbl+2*32],%f4
-fmovdg %fcc0,%f0,%f16 ! pick one value
-ldd [$ptbl+3*32],%f6
-fmovdl %fcc0,%f2,%f16
-ldd [$ptbl+4*32],%f8
-fmovdg %fcc1,%f4,%f16
-ldd [$ptbl+5*32],%f10
-fmovdl %fcc1,%f6,%f16
-ldd [$ptbl+6*32],%f12
-fmovdg %fcc2,%f8,%f16
-ldd [$ptbl+7*32],%f14
-fmovdl %fcc2,%f10,%f16
-fmovdg %fcc3,%f12,%f16
-fmovdl %fcc3,%f14,%f16
-add $ptbl,8*32,$ptbl
+ldx [$pwrtbl+0*32], $B0
+ldx [$pwrtbl+8*32], $B1
+ldx [$pwrtbl+1*32], %o4
+ldx [$pwrtbl+9*32], %o5
+movvs %icc, %o4, $B0
+ldx [$pwrtbl+2*32], %o4
+movvs %icc, %o5, $B1
+ldx [$pwrtbl+10*32],%o5
+move %icc, %o4, $B0
+ldx [$pwrtbl+3*32], %o4
+move %icc, %o5, $B1
+ldx [$pwrtbl+11*32],%o5
+movneg %icc, %o4, $B0
+ldx [$pwrtbl+4*32], %o4
+movneg %icc, %o5, $B1
+ldx [$pwrtbl+12*32],%o5
+movcs %xcc, %o4, $B0
+ldx [$pwrtbl+5*32],%o4
+movcs %xcc, %o5, $B1
+ldx [$pwrtbl+13*32],%o5
+movvs %xcc, %o4, $B0
+ldx [$pwrtbl+6*32], %o4
+movvs %xcc, %o5, $B1
+ldx [$pwrtbl+14*32],%o5
+move %xcc, %o4, $B0
+ldx [$pwrtbl+7*32], %o4
+move %xcc, %o5, $B1
+ldx [$pwrtbl+15*32],%o5
+movneg %xcc, %o4, $B0
+add $pwrtbl,16*32, $pwrtbl
+movneg %xcc, %o5, $B1
+___
+}
+sub load_b {
+my ($pwrtbl,$Bi)=@_;
+$code.=<<___;
+ldx [$pwrtbl+0*32], $Bi
+ldx [$pwrtbl+1*32], %o4
+ldx [$pwrtbl+2*32], %o5
+movvs %icc, %o4, $Bi
+ldx [$pwrtbl+3*32], %o4
+move %icc, %o5, $Bi
+ldx [$pwrtbl+4*32], %o5
+movneg %icc, %o4, $Bi
+ldx [$pwrtbl+5*32], %o4
+movcs %xcc, %o5, $Bi
+ldx [$pwrtbl+6*32], %o5
+movvs %xcc, %o4, $Bi
+ldx [$pwrtbl+7*32], %o4
+move %xcc, %o5, $Bi
+add $pwrtbl,8*32, $pwrtbl
+movneg %xcc, %o4, $Bi
 ___
 }

 ########################################################################
 # int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
-# const u64 *pwrtbl,int pwr);
+# const u64 *pwrtbl,int pwr,int stride);
 #
 sub generate_bn_pwr5_mont_t4() {
 my $NUM=shift;
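
The removed load_fcc/load_f16 pair above picked a power-table entry with %fcc comparisons against a floating-point "magic table" and fmovd conditional moves; the replacement load_ccr/load_b(_pair) does the same job with an integer %ccr mask and movcc, which is what the commit message calls switching to integer conditional moves. The essential property is unchanged: every candidate entry is read, and the secret index only steers a register-to-register select, never an address. A rough portable C sketch of that gather (illustrative only; gather5_ct is not an OpenSSL function, and the masking below stands in for the condition-code trick):

#include <stddef.h>
#include <stdint.h>

/* Constant-time gather from a power table laid out the way the perlasm
 * expects it: word j of entry i lives at tbl[j*32 + i] (the layout that
 * bn_flip_n_scatter5_t4 produces), so all 32 candidate words for a given
 * j are contiguous and every lookup touches the same cache lines.  The
 * entry selected by 'power' is kept with a mask, not a secret address. */
void gather5_ct(uint64_t *out, const uint64_t *tbl, size_t num, uint32_t power)
{
    size_t i, j;

    for (j = 0; j < num; j++) {
        uint64_t acc = 0;
        for (i = 0; i < 32; i++) {
            uint64_t d = (uint64_t)(i ^ power);
            uint64_t mask = ((d | (0 - d)) >> 63) - 1; /* all-ones iff i == power */
            acc |= tbl[j * 32 + i] & mask;
        }
        out[j] = acc;
    }
}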
@@ -457,7 +474,9 @@ bn_pwr5_mont_t4_$NUM:
 ld [%i2+0],%f1 ! load *n0
 ld [%i2+4],%f0
 mov %i3,$pwrtbl
-mov %i4,$pwr
+srl %i4,%g0,%i4 ! pack last arguments
+sllx %i5,32,$pwr
+or %i4,$pwr,$pwr
 fsrc2 %f0,%f60
 ___
@@ -501,31 +520,43 @@ $code.=<<___;
 ___
 }
 # load pwrtbl[pwr] ########################################################
-&load_fcc($pwrtbl,$pwr,@B[0]);
-for($i=0; $i<6 && $i<$NUM; $i++) {
-&load_f16($pwrtbl);
 $code.=<<___;
-movdtox %f16,@B[$i]
+save %sp,-128,%sp; or $sentinel,%fp,%fp
+srlx $pwr, 32, %o4 ! unpack $pwr
+srl $pwr, %g0, %o5
+sub %o4, 5, %o4
+mov $pwrtbl, %o7
+sllx %o4, 32, $pwr ! re-pack $pwr
+or %o5, $pwr, $pwr
+srl %o5, %o4, %o5
 ___
+&load_ccr("%o7","%o5","%o4");
+$code.=<<___;
+b .Lstride_$NUM
+nop
+.align 16
+.Lstride_$NUM:
+___
+for($i=0; $i<14 && $i<$NUM; $i+=2) {
+&load_b_pair("%o7",@B[$i],@B[$i+1]);
 }
 $code.=<<___;
 save %sp,-128,%sp; or $sentinel,%fp,%fp
 ___
-for(; $i<20 && $i<$NUM; $i++) {
-&load_f16($pwrtbl);
-$code.=<<___;
-movdtox %f16,@B[$i]
-___
+for(; $i<$NUM; $i+=2) {
+&load_b_pair("%i7",@B[$i],@B[$i+1]);
 }
 $code.=<<___;
-save %sp,-128,%sp; or $sentinel,%fp,%fp
+srax $pwr, 32, %o4 ! unpack $pwr
+srl $pwr, %g0, %o5
+sub %o4, 5, %o4
+mov $pwrtbl, %i7
+sllx %o4, 32, $pwr ! re-pack $pwr
+or %o5, $pwr, $pwr
+srl %o5, %o4, %o5
 ___
-for(; $i<$NUM; $i++) {
-&load_f16($pwrtbl);
-$code.=<<___;
-movdtox %f16,@B[$i]
-___
-}
+&load_ccr("%i7","%o5","%o4",1);

 # magic ################################################################
 for($i=0; $i<5; $i++) {
@@ -540,21 +571,24 @@ $code.=<<___;
 ___
 }
 $code.=<<___;
+wr %o4, %g0, %ccr
 .word 0x81b02920+$NUM-1 ! montmul $NUM-1
 fbu,pn %fcc3,.Labort_$NUM
 #ifndef __arch64__
 and %fp,$sentinel,$sentinel
 brz,pn $sentinel,.Labort_$NUM
 #endif
-nop
+srax $pwr, 32, %o4
 #ifdef __arch64__
+brgez %o4,.Lstride_$NUM
 restore
 restore
 restore
 restore
 restore
 #else
+brgez %o4,.Lstride_$NUM
 restore; and %fp,$sentinel,$sentinel
 restore; and %fp,$sentinel,$sentinel
 restore; and %fp,$sentinel,$sentinel
@@ -903,13 +937,12 @@ ___
 # +-------------------------------+<----- aligned at 64 bytes
 # . .
 ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
-($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
 ($ovf,$i)=($t0,$t1);
-&load_fcc($bp,"%g4","%g1");
-&load_f16($bp);
-$code.=<<___;
-movdtox %f16, $m0 ! m0=bp[0]
+&load_ccr($bp,"%g4",$ccr);
+&load_b($bp,$m0,"%o7"); # m0=bp[0]
+$code.=<<___;
 ld [$n0p+0], $t0 ! pull n0[0..1] value
 ld [$n0p+4], $t1
 add %sp, STACK_BIAS+STACK_FRAME, $tp
@@ -990,11 +1023,10 @@ $code.=<<___;
 .align 16
 .Louter_g5:
+wr $ccr, %g0, %ccr
 ___
-&load_f16($bp);
+&load_b($bp,$m0); # m0=bp[i]
 $code.=<<___;
-movdtox %f16, $m0 ! m0=bp[i]
 sub $ap, $num, $ap ! rewind
 sub $np, $num, $np
 sub $tp, $num, $tp
@@ -1138,39 +1170,40 @@ bn_flip_t4:
 .type bn_flip_t4, #function
 .size bn_flip_t4, .-bn_flip_t4
-.globl bn_scatter5_t4
+.globl bn_flip_n_scatter5_t4
 .align 32
-bn_scatter5_t4:
+bn_flip_n_scatter5_t4:
 sll %o3, 3, %o3
-sub %o1, 1, %o1
+srl %o1, 1, %o1
 add %o3, %o2, %o2 ! &pwrtbl[pwr]
-nop
-.Loop_scatter5:
-ldx [%o0], %g1 ! inp[i]
+sub %o1, 1, %o1
+.Loop_flip_n_scatter5:
+ld [%o0+0], %o4 ! inp[i]
+ld [%o0+4], %o5
 add %o0, 8, %o0
-stx %g1, [%o2]
+sllx %o5, 32, %o5
+or %o4, %o5, %o5
+stx %o5, [%o2]
 add %o2, 32*8, %o2
-brnz %o1, .Loop_scatter5
+brnz %o1, .Loop_flip_n_scatter5
 sub %o1, 1, %o1
 retl
 nop
-.type bn_scatter5_t4, #function
-.size bn_scatter5_t4, .-bn_scatter5_t4
+.type bn_flip_n_scatter5_t4, #function
+.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
 .globl bn_gather5_t4
 .align 32
 bn_gather5_t4:
-mov %o7, %o5
 ___
-&load_fcc("%o2","%o3","%o4");
+&load_ccr("%o2","%o3","%g1");
 $code.=<<___;
-mov %o5, %o7
 sub %o1, 1, %o1
 .Loop_gather5:
 ___
-&load_f16("%o2");
+&load_b("%o2","%g1");
 $code.=<<___;
-std %f16, [%o0]
+stx %g1, [%o0]
 add %o0, 8, %o0
 brnz %o1, .Loop_gather5
 sub %o1, 1, %o1
@@ -1179,19 +1212,7 @@ $code.=<<___;
 nop
 .type bn_gather5_t4, #function
 .size bn_gather5_t4, .-bn_gather5_t4
-___
-$code.=<<___;
-#define ONE 0x3f800000
-#define NUL 0x00000000
-#define NaN 0xffffffff
-.align 64
-.Lmagic:
-.long ONE,NUL,NaN,NaN,NaN,NaN,NUL,ONE
-.long NUL,ONE,ONE,NUL,NaN,NaN,NaN,NaN
-.long NaN,NaN,NUL,ONE,ONE,NUL,NaN,NaN
-.long NaN,NaN,NaN,NaN,NUL,ONE,ONE,NUL
 .asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
 .align 4
 ___

crypto/bn/bn_exp.c

@@ -472,7 +472,15 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 wstart=bits-1; /* The top bit of the window */
 wend=0; /* The bottom bit of the window */
+#if 1 /* by Shay Gueron's suggestion */
+j = mont->N.top; /* borrow j */
+if (bn_wexpand(r,j) == NULL) goto err;
+r->d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
+for(i=1;i<j;i++) r->d[i] = (~m->d[i])&BN_MASK2;
+r->top = j;
+#else
 if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
+#endif
 for (;;)
 {
 if (BN_is_bit_set(p,wstart) == 0)
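
The #if 1 branch added here builds 1*R mod m (R = 2^(top*BN_BITS2)) directly instead of calling BN_to_montgomery(): for an odd modulus, the word pattern (0 - m[0]) & BN_MASK2, ~m[1], ..., ~m[top-1] is exactly 2^(top*BN_BITS2) - m, because the +1 of the two's complement never carries past the odd low word, and 2^N - m is congruent to 2^N (mod m) and already reduced whenever m has its top bit set. A self-contained check of that identity on a two-word toy value (not OpenSSL code):

#include <assert.h>
#include <stdint.h>

/* Check, on a 2x64-bit example, that the word-wise complement pattern
 * used above really equals 2^128 - m when m is odd: word 0 is 0 - m[0]
 * (mod 2^64) and every higher word is ~m[i], since the +1 of the two's
 * complement cannot carry past the odd low word. */
int main(void)
{
    uint64_t m[2] = { 0x1234567890abcdefULL | 1, 0xfedcba0987654321ULL };
    uint64_t r[2] = { 0 - m[0], ~m[1] };        /* claimed 2^128 - m */

    /* independently compute 2^128 - m = (~m) + 1 with explicit carry */
    uint64_t lo = ~m[0] + 1;
    uint64_t hi = ~m[1] + (lo == 0);            /* carry only if m[0] == 0 */

    assert(lo == r[0] && hi == r[1]);
    return 0;
}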
@@ -524,6 +532,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 start=0;
 if (wstart < 0) break;
 }
+#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
+if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_VIS3|SPARCV9_PREFER_FPU))
+{
+j = mont->N.top; /* borrow j */
+val[0]->d[0] = 1; /* borrow val[0] */
+for (i=1;i<j;i++) val[0]->d[i] = 0;
+val[0]->top = j;
+if (!BN_mod_mul_montgomery(rr,r,val[0],mont,ctx)) goto err;
+}
+else
+#endif
 if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
 ret=1;
 err:
@@ -533,6 +552,28 @@ err:
 return(ret);
 }
+#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
+static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
+{
+BN_ULONG ret=0;
+int wordpos;
+wordpos = bitpos/BN_BITS2;
+bitpos %= BN_BITS2;
+if (wordpos>=0 && wordpos < a->top)
+{
+ret = a->d[wordpos]&BN_MASK2;
+if (bitpos)
+{
+ret >>= bitpos;
+if (++wordpos < a->top)
+ret |= a->d[wordpos]<<(BN_BITS2-bitpos);
+}
+}
+return ret&BN_MASK2;
+}
+#endif
 /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
  * so that accessing any of these table values shows the same access pattern as far
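
bn_get_bits() added above returns the BN_BITS2 exponent bits that start at bitpos, stitching the window together across a word boundary so the caller can mask off the stride bits it actually wants. The same extraction restated on plain uint64_t words, with a small self-check (illustrative only, not the OpenSSL types):

#include <assert.h>
#include <stdint.h>

/* Same extraction as bn_get_bits() above, on a plain uint64_t array:
 * return the 64 bits starting at 'bitpos', pulling the high part from
 * the next word when the window straddles a word boundary. */
static uint64_t get_bits64(const uint64_t *d, int top, int bitpos)
{
    uint64_t ret = 0;
    int wordpos = bitpos / 64;

    bitpos %= 64;
    if (wordpos >= 0 && wordpos < top) {
        ret = d[wordpos];
        if (bitpos) {
            ret >>= bitpos;
            if (++wordpos < top)
                ret |= d[wordpos] << (64 - bitpos);
        }
    }
    return ret;
}

int main(void)
{
    uint64_t d[2] = { 0xAABBCCDDEEFF1122ULL, 0x0123456789ABCDEFULL };

    /* a window starting at bit 56 takes 8 bits from d[0] and 56 from d[1] */
    assert(get_bits64(d, 2, 56) == ((d[0] >> 56) | (d[1] << 8)));
    return 0;
}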
@@ -673,13 +714,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 tmp.flags = am.flags = BN_FLG_STATIC_DATA;
 /* prepare a^0 in Montgomery domain */
-#if 1
-if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
-#else
+#if 1 /* by Shay Gueron's suggestion */
 tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
 for (i=1;i<top;i++)
 tmp.d[i] = (~m->d[i])&BN_MASK2;
 tmp.top = top;
+#else
+if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
 #endif
 /* prepare a^1 in Montgomery domain */
@@ -694,57 +735,79 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 if (t4)
 {
 typedef int (*bn_pwr5_mont_f)(BN_ULONG *tp,const BN_ULONG *np,
-const BN_ULONG *n0,const void *table,int power);
+const BN_ULONG *n0,const void *table,int power,int bits);
 int bn_pwr5_mont_t4_8(BN_ULONG *tp,const BN_ULONG *np,
-const BN_ULONG *n0,const void *table,int power);
+const BN_ULONG *n0,const void *table,int power,int bits);
 int bn_pwr5_mont_t4_16(BN_ULONG *tp,const BN_ULONG *np,
-const BN_ULONG *n0,const void *table,int power);
+const BN_ULONG *n0,const void *table,int power,int bits);
 int bn_pwr5_mont_t4_24(BN_ULONG *tp,const BN_ULONG *np,
-const BN_ULONG *n0,const void *table,int power);
+const BN_ULONG *n0,const void *table,int power,int bits);
 int bn_pwr5_mont_t4_32(BN_ULONG *tp,const BN_ULONG *np,
-const BN_ULONG *n0,const void *table,int power);
+const BN_ULONG *n0,const void *table,int power,int bits);
-static const bn_pwr5_mont_f funcs[4] = {
+static const bn_pwr5_mont_f pwr5_funcs[4] = {
 bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
 bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32 };
-bn_pwr5_mont_f worker = funcs[top/16-1];
+bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top/16-1];
+typedef int (*bn_mul_mont_f)(BN_ULONG *rp,const BN_ULONG *ap,
+const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
+int bn_mul_mont_t4_8(BN_ULONG *rp,const BN_ULONG *ap,
+const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
+int bn_mul_mont_t4_16(BN_ULONG *rp,const BN_ULONG *ap,
+const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
+int bn_mul_mont_t4_24(BN_ULONG *rp,const BN_ULONG *ap,
+const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
+int bn_mul_mont_t4_32(BN_ULONG *rp,const BN_ULONG *ap,
+const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
+static const bn_mul_mont_f mul_funcs[4] = {
+bn_mul_mont_t4_8, bn_mul_mont_t4_16,
+bn_mul_mont_t4_24, bn_mul_mont_t4_32 };
+bn_mul_mont_f mul_worker = mul_funcs[top/16-1];
+void bn_mul_mont_vis3(BN_ULONG *rp,const BN_ULONG *ap,
+const void *bp,const BN_ULONG *np,
+const BN_ULONG *n0,int num);
 void bn_mul_mont_t4(BN_ULONG *rp,const BN_ULONG *ap,
 const void *bp,const BN_ULONG *np,
 const BN_ULONG *n0,int num);
 void bn_mul_mont_gather5_t4(BN_ULONG *rp,const BN_ULONG *ap,
 const void *table,const BN_ULONG *np,
 const BN_ULONG *n0,int num,int power);
-void bn_scatter5_t4(const BN_ULONG *inp,size_t num,
+void bn_flip_n_scatter5_t4(const BN_ULONG *inp,size_t num,
 void *table,size_t power);
 void bn_gather5_t4(BN_ULONG *out,size_t num,
 void *table,size_t power);
 void bn_flip_t4(BN_ULONG *dst,BN_ULONG *src,size_t num);
-BN_ULONG *np=alloca(top*sizeof(BN_ULONG)), *n0=mont->n0;
+BN_ULONG *np=mont->N.d, *n0=mont->n0;
+int stride = 5*(6-(top/16-1)); /* multiple of 5, but less than 32 */
 /* BN_to_montgomery can contaminate words above .top
  * [in BN_DEBUG[_DEBUG] build]... */
 for (i=am.top; i<top; i++) am.d[i]=0;
 for (i=tmp.top; i<top; i++) tmp.d[i]=0;
-/* switch to 64-bit domain */
-top /= 2;
-bn_flip_t4(np,mont->N.d,top);
-bn_flip_t4(tmp.d,tmp.d,top);
-bn_flip_t4(am.d,am.d,top);
-bn_scatter5_t4(tmp.d,top,powerbuf,0);
-bn_scatter5_t4(am.d,top,powerbuf,1);
-bn_mul_mont_t4(tmp.d,am.d,am.d,np,n0,top);
-bn_scatter5_t4(tmp.d,top,powerbuf,2);
+bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,0);
+bn_flip_n_scatter5_t4(am.d,top,powerbuf,1);
+if (!(*mul_worker)(tmp.d,am.d,am.d,np,n0) &&
+!(*mul_worker)(tmp.d,am.d,am.d,np,n0))
+bn_mul_mont_vis3(tmp.d,am.d,am.d,np,n0,top);
+bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,2);
 for (i=3; i<32; i++)
 {
 /* Calculate a^i = a^(i-1) * a */
-bn_mul_mont_gather5_t4(tmp.d,am.d,powerbuf,np,n0,top,i-1);
-bn_scatter5_t4(tmp.d,top,powerbuf,i);
+if (!(*mul_worker)(tmp.d,tmp.d,am.d,np,n0) &&
+!(*mul_worker)(tmp.d,tmp.d,am.d,np,n0))
+bn_mul_mont_vis3(tmp.d,tmp.d,am.d,np,n0,top);
+bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,i);
 }
+/* switch to 64-bit domain */
+np = alloca(top*sizeof(BN_ULONG));
+top /= 2;
+bn_flip_t4(np,mont->N.d,top);
 bits--;
 for (wvalue=0, i=bits%5; i>=0; i--,bits--)
 wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
@@ -755,12 +818,17 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 */
 while (bits >= 0)
 {
-for (wvalue=0, i=0; i<5; i++,bits--)
-wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+if (bits < stride) stride = bits+1;
+bits -= stride;
+wvalue = bn_get_bits(p,bits+1);
-if ((*worker)(tmp.d,np,n0,powerbuf,wvalue)) continue;
+if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
 /* retry once and fall back */
-if ((*worker)(tmp.d,np,n0,powerbuf,wvalue)) continue;
+if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
+bits += stride-5;
+wvalue >>= stride-5;
+wvalue &= 31;
 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
 bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
@@ -921,6 +989,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 }
 /* Convert the final result from montgomery to standard format */
+#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
+if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_VIS3|SPARCV9_PREFER_FPU))
+{
+am.d[0] = 1; /* borrow am */
+for (i=1;i<top;i++) am.d[i] = 0;
+if (!BN_mod_mul_montgomery(rr,&tmp,&am,mont,ctx)) goto err;
+}
+else
+#endif
 if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
 ret=1;
 err: