Support for "multiply high" instruction, see BN_UMULT_HIGH comment in
crypto/bn/bn_lcl.h for further details. It should be noted that for the moment of this writing the code was tested only on Alpha. If compiled with DEC C the C implementation exhibits 12% performance improvement over the crypto/bn/asm/alpha.s (on EV56 box running AlphaLinux). GNU C is (unfortunately) 8% behind the assembler implementation. But it's OpenVMS Alpha users who *may* benefit most as 'apps/openssl speed rsa' exhibits 6 (six) times performance improvement over the original VMS bignum implementation. Where "*may*" means "as soon as code is enabled though #define SIXTY_FOUR_BIT and crypto/bn/asm/vms.mar is skipped."
This commit is contained in:
parent
54a34aecc3
commit
fb81ac5e6b
@ -60,7 +60,7 @@
|
|||||||
#include "cryptlib.h"
|
#include "cryptlib.h"
|
||||||
#include "bn_lcl.h"
|
#include "bn_lcl.h"
|
||||||
|
|
||||||
#ifdef BN_LLONG
|
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
|
||||||
|
|
||||||
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
||||||
{
|
{
|
||||||
@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
|||||||
bn_check_num(num);
|
bn_check_num(num);
|
||||||
if (num <= 0) return(c1);
|
if (num <= 0) return(c1);
|
||||||
|
|
||||||
for (;;)
|
while (num&~3)
|
||||||
{
|
{
|
||||||
mul_add(rp[0],ap[0],w,c1);
|
mul_add(rp[0],ap[0],w,c1);
|
||||||
if (--num == 0) break;
|
|
||||||
mul_add(rp[1],ap[1],w,c1);
|
mul_add(rp[1],ap[1],w,c1);
|
||||||
if (--num == 0) break;
|
|
||||||
mul_add(rp[2],ap[2],w,c1);
|
mul_add(rp[2],ap[2],w,c1);
|
||||||
if (--num == 0) break;
|
|
||||||
mul_add(rp[3],ap[3],w,c1);
|
mul_add(rp[3],ap[3],w,c1);
|
||||||
if (--num == 0) break;
|
ap+=4; rp+=4; num-=4;
|
||||||
ap+=4;
|
}
|
||||||
rp+=4;
|
if (num)
|
||||||
|
{
|
||||||
|
mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
|
||||||
|
mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
|
||||||
|
mul_add(rp[2],ap[2],w,c1); return c1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return(c1);
|
return(c1);
|
||||||
@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
|||||||
bn_check_num(num);
|
bn_check_num(num);
|
||||||
if (num <= 0) return(c1);
|
if (num <= 0) return(c1);
|
||||||
|
|
||||||
/* for (;;) */
|
while (num&~3)
|
||||||
while (1) /* circumvent egcs-1.1.2 bug */
|
|
||||||
{
|
{
|
||||||
mul(rp[0],ap[0],w,c1);
|
mul(rp[0],ap[0],w,c1);
|
||||||
if (--num == 0) break;
|
|
||||||
mul(rp[1],ap[1],w,c1);
|
mul(rp[1],ap[1],w,c1);
|
||||||
if (--num == 0) break;
|
|
||||||
mul(rp[2],ap[2],w,c1);
|
mul(rp[2],ap[2],w,c1);
|
||||||
if (--num == 0) break;
|
|
||||||
mul(rp[3],ap[3],w,c1);
|
mul(rp[3],ap[3],w,c1);
|
||||||
if (--num == 0) break;
|
ap+=4; rp+=4; num-=4;
|
||||||
ap+=4;
|
}
|
||||||
rp+=4;
|
if (num)
|
||||||
|
{
|
||||||
|
mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
|
||||||
|
mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
|
||||||
|
mul(rp[2],ap[2],w,c1);
|
||||||
}
|
}
|
||||||
return(c1);
|
return(c1);
|
||||||
}
|
}
|
||||||
@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
|
|||||||
{
|
{
|
||||||
bn_check_num(n);
|
bn_check_num(n);
|
||||||
if (n <= 0) return;
|
if (n <= 0) return;
|
||||||
for (;;)
|
while (n&~3)
|
||||||
{
|
{
|
||||||
BN_ULLONG t;
|
sqr(r[0],r[1],a[0]);
|
||||||
|
sqr(r[2],r[3],a[1]);
|
||||||
t=(BN_ULLONG)(a[0])*(a[0]);
|
sqr(r[4],r[5],a[2]);
|
||||||
r[0]=Lw(t); r[1]=Hw(t);
|
sqr(r[6],r[7],a[3]);
|
||||||
if (--n == 0) break;
|
a+=4; r+=8; n-=4;
|
||||||
|
}
|
||||||
t=(BN_ULLONG)(a[1])*(a[1]);
|
if (n)
|
||||||
r[2]=Lw(t); r[3]=Hw(t);
|
{
|
||||||
if (--n == 0) break;
|
sqr(r[0],r[1],a[0]); if (--n == 0) return;
|
||||||
|
sqr(r[2],r[3],a[1]); if (--n == 0) return;
|
||||||
t=(BN_ULLONG)(a[2])*(a[2]);
|
sqr(r[4],r[5],a[2]);
|
||||||
r[4]=Lw(t); r[5]=Hw(t);
|
|
||||||
if (--n == 0) break;
|
|
||||||
|
|
||||||
t=(BN_ULLONG)(a[3])*(a[3]);
|
|
||||||
r[6]=Lw(t); r[7]=Hw(t);
|
|
||||||
if (--n == 0) break;
|
|
||||||
|
|
||||||
a+=4;
|
|
||||||
r+=8;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
|
|||||||
|
|
||||||
#define sqr_add_c2(a,i,j,c0,c1,c2) \
|
#define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||||
|
|
||||||
|
#elif defined(BN_UMULT_HIGH)
|
||||||
|
|
||||||
|
#define mul_add_c(a,b,c0,c1,c2) { \
|
||||||
|
BN_ULONG ta=(a),tb=(b); \
|
||||||
|
t1 = ta * tb; \
|
||||||
|
t2 = BN_UMULT_HIGH(ta,tb); \
|
||||||
|
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||||
|
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define mul_add_c2(a,b,c0,c1,c2) { \
|
||||||
|
BN_ULONG ta=(a),tb=(b),t0; \
|
||||||
|
t1 = BN_UMULT_HIGH(ta,tb); \
|
||||||
|
t0 = ta * tb; \
|
||||||
|
t2 = t1+t1; c2 += (t2<t1)?1:0; \
|
||||||
|
t1 = t0+t0; t2 += (t1<t0)?1:0; \
|
||||||
|
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||||
|
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define sqr_add_c(a,i,c0,c1,c2) { \
|
||||||
|
BN_ULONG ta=(a)[i]; \
|
||||||
|
t1 = ta * ta; \
|
||||||
|
t2 = BN_UMULT_HIGH(ta,ta); \
|
||||||
|
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||||
|
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||||
|
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#define mul_add_c(a,b,c0,c1,c2) \
|
#define mul_add_c(a,b,c0,c1,c2) \
|
||||||
t1=LBITS(a); t2=HBITS(a); \
|
t1=LBITS(a); t2=HBITS(a); \
|
||||||
|
@ -280,9 +280,14 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
|
|||||||
*/
|
*/
|
||||||
rem=(n1-q*d0)&BN_MASK2;
|
rem=(n1-q*d0)&BN_MASK2;
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef BN_UMULT_HIGH
|
||||||
|
t2l = d1 * q;
|
||||||
|
t2h = BN_UMULT_HIGH(d1,q);
|
||||||
|
#else
|
||||||
t2l=LBITS(d1); t2h=HBITS(d1);
|
t2l=LBITS(d1); t2h=HBITS(d1);
|
||||||
ql =LBITS(q); qh =HBITS(q);
|
ql =LBITS(q); qh =HBITS(q);
|
||||||
mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
|
mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
|
||||||
|
#endif
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
|
@ -86,6 +86,54 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if !defined(NO_ASM) && !defined(PEDANTIC)
|
||||||
|
/*
|
||||||
|
* BN_UMULT_HIGH section.
|
||||||
|
*
|
||||||
|
* No, I'm not trying to overwhelm you when stating that the
|
||||||
|
* product of N-bit numbers is 2*N bits wide:-) No, I don't expect
|
||||||
|
* you to be impressed when I say that if the compiler doesn't
|
||||||
|
* support 2*N integer type, then you have to replace every N*N
|
||||||
|
* multiplication with 4 (N/2)*(N/2) accompanied by some shifts
|
||||||
|
* and additions which unavoidably results in severe performance
|
||||||
|
* penalties. Of course provided that the hardware is capable of
|
||||||
|
* producing 2*N result... That's when you normally start
|
||||||
|
* considering assembler implementation. However! It should be
|
||||||
|
* pointed out that some CPUs (most notably Alpha, PowerPC and
|
||||||
|
* upcoming IA-64 family:-) provide *separate* instruction
|
||||||
|
* calculating the upper half of the product placing the result
|
||||||
|
* into a general purpose register. Now *if* the compiler supports
|
||||||
|
* inline assembler, then it's not impossible to implement the
|
||||||
|
* "bignum" routines (and have the compiler optimize 'em)
|
||||||
|
* exhibiting "native" performance in C. That's what BN_UMULT_HIGH
|
||||||
|
* macro is about:-)
|
||||||
|
*
|
||||||
|
* <appro@fy.chalmers.se>
|
||||||
|
*/
|
||||||
|
# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
|
||||||
|
# if defined(__DECC)
|
||||||
|
# include <c_asm.h>
|
||||||
|
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
|
||||||
|
# elif defined(__GNUC__)
|
||||||
|
# define BN_UMULT_HIGH(a,b) ({ \
|
||||||
|
register BN_ULONG ret; \
|
||||||
|
asm ("umulh %1,%2,%0" \
|
||||||
|
: "=r"(ret) \
|
||||||
|
: "r"(a), "r"(b)); \
|
||||||
|
ret; })
|
||||||
|
# endif /* compiler */
|
||||||
|
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
|
||||||
|
# if defined(__GNUC__)
|
||||||
|
# define BN_UMULT_HIGH(a,b) ({ \
|
||||||
|
register BN_ULONG ret; \
|
||||||
|
asm ("mulhdu %0,%1,%2" \
|
||||||
|
: "=r"(ret) \
|
||||||
|
: "r"(a), "r"(b)); \
|
||||||
|
ret; })
|
||||||
|
# endif /* compiler */
|
||||||
|
# endif /* cpu */
|
||||||
|
#endif /* NO_ASM */
|
||||||
|
|
||||||
/*************************************************************
|
/*************************************************************
|
||||||
* Using the long long type
|
* Using the long long type
|
||||||
*/
|
*/
|
||||||
@ -151,6 +199,43 @@ extern "C" {
|
|||||||
(c)= Hw(t); \
|
(c)= Hw(t); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define sqr(r0,r1,a) { \
|
||||||
|
BN_ULLONG t; \
|
||||||
|
t=(BN_ULLONG)(a)*(a); \
|
||||||
|
(r0)=Lw(t); \
|
||||||
|
(r1)=Hw(t); ]
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(BN_UMULT_HIGH)
|
||||||
|
#define mul_add(r,a,w,c) { \
|
||||||
|
BN_ULONG high,low,ret,tmp=(a); \
|
||||||
|
ret = (r); \
|
||||||
|
high= BN_UMULT_HIGH(w,tmp); \
|
||||||
|
ret += (c); \
|
||||||
|
low = (w) * tmp; \
|
||||||
|
(c) = (ret<(c))?1:0; \
|
||||||
|
(c) += high; \
|
||||||
|
ret += low; \
|
||||||
|
(c) += (ret<low)?1:0; \
|
||||||
|
(r) = ret; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define mul(r,a,w,c) { \
|
||||||
|
BN_ULONG high,low,ret,ta=(a); \
|
||||||
|
low = (w) * ta; \
|
||||||
|
high= BN_UMULT_HIGH(w,ta); \
|
||||||
|
ret = low + (c); \
|
||||||
|
(c) = high; \
|
||||||
|
(c) += (ret<low)?1:0; \
|
||||||
|
(r) = ret; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define sqr(r0,r1,a) { \
|
||||||
|
BN_ULONG tmp=(a); \
|
||||||
|
(r0) = tmp * tmp; \
|
||||||
|
(r1) = BN_UMULT_HIGH(tmp,tmp); \
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
/*************************************************************
|
/*************************************************************
|
||||||
* No long long type
|
* No long long type
|
||||||
|
Loading…
x
Reference in New Issue
Block a user