Bignum division tune-up. Idea is to move multiplications in front of

loop body and replace 'em with addition/subtraction.
This commit is contained in:
Andy Polyakov 1999-07-30 11:43:43 +00:00
parent a40f6dce87
commit 0dd25e3606
3 changed files with 180 additions and 89 deletions

View File

@ -1,5 +1,5 @@
.rdata .rdata
.asciiz "mips3.s, Version 1.0 (prerelease)" .asciiz "mips3.s, Version 1.0"
.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" .asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
/* /*
@ -19,19 +19,26 @@
* a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
* module. For updates see http://fy.chalmers.se/~appro/hpe/. * module. For updates see http://fy.chalmers.se/~appro/hpe/.
* *
* The module is designed to work with "new" IRIX ABI(5), namely * The module is designed to work with either of the "new" MIPS ABI(5),
* N32 and N64. But it was tested only with MIPSpro 7.2.x assembler, * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
* i.e. depends on preprocessor options set up by MIPSspro 7.2.x * IRIX 5.x not only because it doesn't support new ABIs but also
* driver. Another neat gadget offered by MIPSpro 7.2.x assembler is * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
* an peep-hole(?) optimization pass. This gave me the opportunity * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
* to make the code looking more regular as all those architecture * cause illegal instruction exception:-(
* dependent(!) instruction rescheduling details were left to the *
* assembler. Cool, huh? Do note that I have no idea if GNU assembler * In addition the code depends on preprocessor flags set up by MIPSpro
* does anything similar nor how GNU C will do with this module. * compiler driver (either as or cc) and therefore (probably?) can't be
* Feedback on the matter is therefore very much appreciated:-) * compiled by the GNU assembler. GNU C driver manages fine though...
* I mean as long as -mmips-as is specified or is the default option,
* because then it simply invokes /usr/bin/as which in turn takes
* perfect care of the preprocessor definitions. Another neat feature
* offered by the MIPSpro assembler is an optimization pass. This gave
* me the opportunity to have the code looking more regular as all those
* architecture dependent instruction rescheduling details were left to
* the assembler. Cool, huh?
* *
* Performance improvement is astonishing! 'apps/openssl speed rsa dsa' * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
* exhibits 3-3.5-3.7 times improvement! * goes way over 3 times faster!
* *
* <appro@fy.chalmers.se> * <appro@fy.chalmers.se>
*/ */
@ -56,8 +63,8 @@
#define MINUS4 v1 #define MINUS4 v1
.align 5
LEAF(bn_mul_add_words) LEAF(bn_mul_add_words)
.align 5
.set noreorder .set noreorder
bgtzl a2,.L_bn_mul_add_words_proceed bgtzl a2,.L_bn_mul_add_words_proceed
ld t0,0(a1) ld t0,0(a1)
@ -185,8 +192,8 @@ LEAF(bn_mul_add_words)
jr ra jr ra
END(bn_mul_add_words) END(bn_mul_add_words)
.align 5
LEAF(bn_mul_words) LEAF(bn_mul_words)
.align 5
.set noreorder .set noreorder
bgtzl a2,.L_bn_mul_words_proceed bgtzl a2,.L_bn_mul_words_proceed
ld t0,0(a1) ld t0,0(a1)
@ -284,8 +291,8 @@ LEAF(bn_mul_words)
jr ra jr ra
END(bn_mul_words) END(bn_mul_words)
.align 5
LEAF(bn_sqr_words) LEAF(bn_sqr_words)
.align 5
.set noreorder .set noreorder
bgtzl a2,.L_bn_sqr_words_proceed bgtzl a2,.L_bn_sqr_words_proceed
ld t0,0(a1) ld t0,0(a1)
@ -371,8 +378,8 @@ LEAF(bn_sqr_words)
jr ra jr ra
END(bn_sqr_words) END(bn_sqr_words)
.align 5
LEAF(bn_add_words) LEAF(bn_add_words)
.align 5
.set noreorder .set noreorder
bgtzl a3,.L_bn_add_words_proceed bgtzl a3,.L_bn_add_words_proceed
ld t0,0(a1) ld t0,0(a1)
@ -471,8 +478,8 @@ LEAF(bn_add_words)
jr ra jr ra
END(bn_add_words) END(bn_add_words)
.align 5
LEAF(bn_sub_words) LEAF(bn_sub_words)
.align 5
.set noreorder .set noreorder
bgtzl a3,.L_bn_sub_words_proceed bgtzl a3,.L_bn_sub_words_proceed
ld t0,0(a1) ld t0,0(a1)
@ -567,24 +574,24 @@ END(bn_sub_words)
#undef MINUS4 #undef MINUS4
.align 5
LEAF(bn_div_words) LEAF(bn_div_words)
.align 5
.set noreorder .set noreorder
bnezl a2,.L_bn_div_words_proceed bnezl a2,.L_bn_div_words_proceed
move t0,zero move v1,zero
jr ra jr ra
li v0,-1 /* I'd rather signal div-by-zero li v0,-1 /* I'd rather signal div-by-zero
* which can be done with 'break 7' */ * which can be done with 'break 7' */
.set reorder
.L_bn_div_words_proceed: .L_bn_div_words_proceed:
bltz a2,.L_bn_div_words_body bltz a2,.L_bn_div_words_body
.set noreorder move t9,v1
dsll a2,1 dsll a2,1
bgtz a2,.-4 bgtz a2,.-4
addu t0,1 addu t9,1
.set reorder .set reorder
negu t1,t0 negu t1,t9
li t2,-1 li t2,-1
dsll t2,t1 dsll t2,t1
and t2,a0 and t2,a0
@ -593,65 +600,135 @@ LEAF(bn_div_words)
bnezl t2,.+8 bnezl t2,.+8
break 6 /* signal overflow */ break 6 /* signal overflow */
.set reorder .set reorder
dsll a0,t0 dsll a0,t9
dsll a1,t0 dsll a1,t9
or a0,AT or a0,AT
#define QT ta0 #define QT ta0
#define DH ta1 #define HH ta1
#define HH ta2 #define DH v1
#define MINUS1 ta3
.L_bn_div_words_body: .L_bn_div_words_body:
dsrl DH,a2,32 dsrl DH,a2,32
li v1,2
sgeu AT,a0,a2 sgeu AT,a0,a2
li MINUS1,-1
.set noreorder .set noreorder
bnezl AT,.+8 bnezl AT,.+8
dsubu a0,a2 dsubu a0,a2
.set reorder .set reorder
.L_bn_div_words_outer_loop: li QT,-1
dsrl HH,a0,32 dsrl HH,a0,32
subu v1,1 dsrl QT,32 /* q=0xffffffff */
dsrl QT,MINUS1,32 /* q=0xffffffff */ beq DH,HH,.L_bn_div_words_skip_div1
beq DH,HH,.L_bn_div_words_inner_loop
ddivu zero,a0,DH ddivu zero,a0,DH
mflo QT mflo QT
.L_bn_div_words_inner_loop: .L_bn_div_words_skip_div1:
dmultu a2,QT dmultu a2,QT
dsll t3,a0,32 dsll t3,a0,32
dsrl AT,a1,32 dsrl AT,a1,32
or t3,AT or t3,AT
mflo t0 mflo t0
mfhi t1 mfhi t1
.L_bn_div_words_inner_loop1:
sltu t2,t3,t0 sltu t2,t3,t0
seq t8,HH,t1 seq t8,HH,t1
sltu AT,HH,t1 sltu AT,HH,t1
and t2,t8 and t2,t8
or AT,t2 or AT,t2
.set noreorder .set noreorder
bnezl AT,.L_bn_div_words_inner_loop beqz AT,.L_bn_div_words_inner_loop1_done
dsubu QT,1 sltu t2,t0,a2
.set reorder .set reorder
dsubu QT,1
dsubu a0,t3,t0 dsubu t0,a2
beqz v1,.L_bn_div_words_outer_loop_done dsubu t1,t2
b .L_bn_div_words_inner_loop1
.L_bn_div_words_inner_loop1_done:
dsll a1,32 dsll a1,32
dsubu a0,t3,t0
dsll v0,QT,32 dsll v0,QT,32
b .L_bn_div_words_outer_loop
.L_bn_div_words_outer_loop_done: li QT,-1
dsrl HH,a0,32
dsrl QT,32 /* q=0xffffffff */
beq DH,HH,.L_bn_div_words_skip_div2
ddivu zero,a0,DH
mflo QT
.L_bn_div_words_skip_div2:
dmultu a2,QT
dsll t3,a0,32
dsrl AT,a1,32
or t3,AT
mflo t0
mfhi t1
.L_bn_div_words_inner_loop2:
sltu t2,t3,t0
seq t8,HH,t1
sltu AT,HH,t1
and t2,t8
or AT,t2
.set noreorder
beqz AT,.L_bn_div_words_inner_loop2_done
sltu t2,t0,a2
.set reorder
dsubu QT,1
dsubu t0,a2
dsubu t1,t2
b .L_bn_div_words_inner_loop2
.L_bn_div_words_inner_loop2_done:
dsubu a0,t3,t0
or v0,QT or v0,QT
move v1,a0 /* v1 contains remainder if one wants it */ dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
dsrl a2,t9 /* restore a2 */
jr ra jr ra
#undef MINUS1
#undef HH #undef HH
#undef DH #undef DH
#undef QT #undef QT
END(bn_div_words) END(bn_div_words)
.align 5
LEAF(bn_div_3_words)
.set reorder
move a3,a0 /* we know that bn_div_words doesn't
* touch a3, ta2, ta3 and preserves a2
* so that we can save two arguments
* and return address in registers
* instead of stack:-)
*/
ld a0,(a3)
move ta2,a2
move a2,a1
ld a1,-8(a3)
move ta3,ra
move v1,zero
li v0,-1
beq a0,a2,.L_bn_div_3_words_skip_div
jal bn_div_words
move ra,ta3
.L_bn_div_3_words_skip_div:
dmultu ta2,v0
ld t2,-16(a3)
mflo t0
mfhi t1
.L_bn_div_3_words_inner_loop:
sgeu AT,t2,t0
seq t9,t1,v1
sltu t8,t1,v1
and AT,t9
or AT,t8
bnez AT,.L_bn_div_3_words_inner_loop_done
daddu v1,a2
sltu t3,t0,ta2
sltu AT,v1,a2
dsubu v0,1
dsubu t0,ta2
dsubu t1,t3
beqz AT,.L_bn_div_3_words_inner_loop
.L_bn_div_3_words_inner_loop_done:
jr ra
END(bn_div_3_words)
#define a_0 t0 #define a_0 t0
#define a_1 t1 #define a_1 t1
#define a_2 t2 #define a_2 t2
@ -679,20 +756,19 @@ END(bn_div_words)
#define FRAME_SIZE 48 #define FRAME_SIZE 48
.align 5
LEAF(bn_mul_comba8) LEAF(bn_mul_comba8)
.align 5
.set noreorder .set noreorder
PTR_SUB sp,FRAME_SIZE PTR_SUB sp,FRAME_SIZE
.frame sp,64,ra .frame sp,64,ra
.set reorder .set reorder
ld a_0,0(a1) /* If compiled with -mips3 options ld a_0,0(a1) /* If compiled with -mips3 option on
* assembler barks on this line with * R5000 box assembler barks on this
* "shouldn't have mult/div as last * line with "shouldn't have mult/div
* instruction in bb (R10K bug)" * as last instruction in bb (R10K
* warning. If anybody out there has * bug)" warning. If anybody out there
* a clue on what does "bb" mean and * has a clue about how to circumvent
* how to circumvent this do send me * this do send me a note.
* a note.
* <appro@fy.chalmers.se> * <appro@fy.chalmers.se>
*/ */
ld b_0,0(a2) ld b_0,0(a2)
@ -1286,8 +1362,8 @@ LEAF(bn_mul_comba8)
jr ra jr ra
END(bn_mul_comba8) END(bn_mul_comba8)
.align 5
LEAF(bn_mul_comba4) LEAF(bn_mul_comba4)
.align 5
.set reorder .set reorder
ld a_0,0(a1) ld a_0,0(a1)
ld b_0,0(a2) ld b_0,0(a2)
@ -1444,8 +1520,8 @@ END(bn_mul_comba4)
#define a_6 b_2 #define a_6 b_2
#define a_7 b_3 #define a_7 b_3
.align 5
LEAF(bn_sqr_comba8) LEAF(bn_sqr_comba8)
.align 5
.set reorder .set reorder
ld a_0,0(a1) ld a_0,0(a1)
ld a_1,8(a1) ld a_1,8(a1)
@ -1934,8 +2010,8 @@ LEAF(bn_sqr_comba8)
jr ra jr ra
END(bn_sqr_comba8) END(bn_sqr_comba8)
.align 5
LEAF(bn_sqr_comba4) LEAF(bn_sqr_comba4)
.align 5
.set reorder .set reorder
ld a_0,0(a1) ld a_0,0(a1)
ld a_1,8(a1) ld a_1,8(a1)

View File

@ -264,18 +264,20 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
else else
q=h/dh; q=h/dh;
th=q*dh;
tl=dl*q;
for (;;) for (;;)
{ {
t=(h-(th=q*dh)); t=h-th;
tl=BN_MASK2;
if ((t&BN_MASK2h) || if ((t&BN_MASK2h) ||
((tl=dl*q) <= ( ((tl) <= (
(t<<BN_BITS4)| (t<<BN_BITS4)|
((l&BN_MASK2h)>>BN_BITS4)))) ((l&BN_MASK2h)>>BN_BITS4))))
break; break;
q--; q--;
th-=dh;
tl-=dl;
} }
if (tl==BN_MASK2) tl=q*dl;
t=(tl>>BN_BITS4); t=(tl>>BN_BITS4);
tl=(tl<<BN_BITS4)&BN_MASK2h; tl=(tl<<BN_BITS4)&BN_MASK2h;
th+=t; th+=t;

View File

@ -200,56 +200,69 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
for (i=0; i<loop-1; i++) for (i=0; i<loop-1; i++)
{ {
BN_ULONG q,n0,n1; BN_ULONG q,l0;
BN_ULONG l0; #ifdef BN_DIV3W
q=bn_div_3_words(wnump,d0,d1);
#else
BN_ULONG n0,n1,rem;
wnum.d--; wnum.top++;
n0=wnump[0]; n0=wnump[0];
n1=wnump[-1]; n1=wnump[-1];
if (n0 == d0) if (n0 == d0)
q=BN_MASK2; q=BN_MASK2;
else else
#if defined(BN_LLONG) && defined(BN_DIV2W)
q=((((BN_ULLONG)n0)<<BN_BITS2)|n1)/((BN_ULLONG)d0);
#else
q=bn_div_words(n0,n1,d0); q=bn_div_words(n0,n1,d0);
#endif
{ {
#ifdef BN_LLONG #ifdef BN_LLONG
BN_ULLONG t1,t2,rem; BN_ULLONG t2;
t1=((BN_ULLONG)n0<<BN_BITS2)|n1;
/*
* rem doesn't have to be BN_ULLONG. The least we
* know it's less that d0, isn't it?
*/
rem=(n1-q*d0)&BN_MASK2;
t2=(BN_ULLONG)d1*q;
for (;;) for (;;)
{ {
rem=t1-(BN_ULLONG)q*d0; if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
t2=(BN_ULLONG)d1*q;
if ((rem>>BN_BITS2) ||
(t2 <= ((rem<<BN_BITS2)|wnump[-2])))
break; break;
q--; q--;
rem += d0;
if (rem < d0) break; /* don't let rem overflow */
t2 -= d1;
} }
#else #else
BN_ULONG t1l,t1h,t2l,t2h,t3l,t3h,ql,qh,t3t; BN_ULONG t2l,t2h,ql,qh;
t1h=n0;
t1l=n1; /*
* It's more than enough with the only multiplication.
* See the comment above in BN_LLONG section...
*/
rem=(n1-q*d0)&BN_MASK2;
t2l=LBITS(d1); t2h=HBITS(d1);
ql =LBITS(q); qh =HBITS(q);
mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
for (;;) for (;;)
{ {
t2l=LBITS(d1); t2h=HBITS(d1); if ((t2h < rem) ||
ql =LBITS(q); qh =HBITS(q); ((t2h == rem) && (t2l <= wnump[-2])))
mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */ break;
t3t=LBITS(d0); t3h=HBITS(d0);
mul64(t3t,t3h,ql,qh); /* t3=t1-(BN_ULLONG)q*d0; */
t3l=(t1l-t3t)&BN_MASK2;
if (t3l > t1l) t3h++;
t3h=(t1h-t3h)&BN_MASK2;
/*if ((t3>>BN_BITS2) ||
(t2 <= ((t3<<BN_BITS2)+wnump[-2])))
break; */
if (t3h) break;
if (t2h < t3l) break;
if ((t2h == t3l) && (t2l <= wnump[-2])) break;
q--; q--;
rem += d0;
if (rem < d0) break; /* don't let rem overflow */
if (t2l < d1) t2h--; t2l -= d1;
} }
#endif #endif
} }
#endif /* BN_DIV3W */
wnum.d--; wnum.top++;
l0=bn_mul_words(tmp->d,sdiv->d,div_n,q); l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
tmp->d[div_n]=l0; tmp->d[div_n]=l0;
for (j=div_n+1; j>0; j--) for (j=div_n+1; j>0; j--)