From e14d4443a27816b05b044350ad39cd15668c55b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ulf=20M=C3=B6ller?= Date: Thu, 20 May 1999 01:43:07 +0000 Subject: [PATCH] Bignum library bug fix. IRIX 6 passes "make test" now! This also avoids the problems with SC4.2 and unpatched SC5. Submitted by: Andy Polyakov --- CHANGES | 4 ++ Configure | 19 +++--- STATUS | 11 +--- config | 6 +- crypto/bn/bn.h | 6 +- crypto/bn/bn_lib.c | 137 +++++++++++++++++++++++++------------------ crypto/bn/bn_prime.c | 6 +- 7 files changed, 101 insertions(+), 88 deletions(-) diff --git a/CHANGES b/CHANGES index 9f04291d9..6c398c064 100644 --- a/CHANGES +++ b/CHANGES @@ -10,6 +10,10 @@ [23-Dec-1998] down below; but in later versions, these hyphens are gone.] + *) Bignum library bug fix. IRIX 6 passes "make test" now! + This also avoids the problems with SC4.2 and unpatched SC5. + [Andy Polyakov ] + *) New functions sk_num, sk_value and sk_set to replace the previous macros. These are required because of the typesafe stack would otherwise break existing code. If old code used a structure member which used to be STACK diff --git a/Configure b/Configure index 0a4f15e91..2cc831a87 100755 --- a/Configure +++ b/Configure @@ -112,15 +112,12 @@ my %table=( "debug-solaris-usparc-gcc","gcc:-O3 -g -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::", # DO NOT use /xO[34] on sparc with SC3.0. It is broken, and will not pass the tests -"solaris-sparc-cc","cc:-fast -O -Xa -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_UNROLL BF_PTR:::", +"solaris-sparc-sc3","cc:-fast -O -Xa -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_UNROLL BF_PTR:::", # SC4 is ok, better than gcc even on bn as long as you tell it -xarch=v8 # -fast slows things like DES down quite a lot -# Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest. 
-"solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::", -"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::", -# SC5.0 note: Compiler common patch 107357-01 or later is required! -"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", -"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:", +"solaris-sparc-cc","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::", +"solaris-usparc-cc","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", +"solaris64-usparc-cc","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:", # Sunos configs, assuming sparc for the gcc one. ##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::", @@ -133,12 +130,12 @@ my %table=( # 3 times faster, use if at all possible. 
#"irix-gcc","gcc:-O2 -mips2::SIXTY_FOUR_BIT BN_LLONG RC4_INDEX RC4_CHAR:::", "irix-gcc","gcc:-O2 -DTERMIOS -DB_ENDIAN:(unknown)::BN_LLONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR:::", -"irix64-gcc","gcc:-mips3 -O2 -DTERMIOS -DB_ENDIAN:(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:::", "irix-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN:(unknown)::BN_LLONG DES_PTR DES_RISC2 DES_UNROLL BF_PTR:::", -"irix64-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN:(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:::", +"irix-mips3-gcc","gcc:-mips3 -O2 -DTERMIOS -DB_ENDIAN:(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:::", +"irix-mips3-cc", "cc:-n32 -mips3 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN:(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:::", "debug-irix-cc", "cc:-w2 -g -DCRYPTO_MDEBUG -DTERMIOS -DB_ENDIAN:(unknown):::::", -# This is the n64 mode build. -"irix-n64-cc", "cc:-64 -O2 -use_readonly_const -DTERMIOS:(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT:::", +# This is the n64 mode build. (Untested!) +"irix64-mips4-cc", "cc:-64 -mips4 -O2 -use_readonly_const -DTERMIOS:(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT:::", # HPUX 9.X config. # Don't use the bundled cc. It is broken. Use HP ANSI C if possible, or gcc. 
diff --git a/STATUS b/STATUS index 553232d52..d68ab036c 100644 --- a/STATUS +++ b/STATUS @@ -1,6 +1,6 @@ OpenSSL STATUS Last modified at - ______________ $Date: 1999/05/18 08:52:01 $ + ______________ $Date: 1999/05/20 01:42:57 $ DEVELOPMENT STATE @@ -14,18 +14,9 @@ o OpenSSL 0.9.2b: Released on March 22th, 1999 o OpenSSL 0.9.1c: Released on December 23th, 1998 - [ Proposed new numbering scheme: .[] - 0.9.1c is 0913 - 1.0 is 010000 - 1.0 a is 010001 - 1.8 z is 01081a ] - RELEASE SHOWSTOPPERS o BSD/OS: assembler functions must not have leading underscores - o exptest and rsa_oaep_test fail with irix64-* - (Don Badrak : "Re: Problems to compile openssl - on IRIX 6.2", openssl-users) AVAILABLE PATCHES diff --git a/config b/config index 3ddf4bb46..484f7937f 100755 --- a/config +++ b/config @@ -286,9 +286,9 @@ else if [ "$SYSTEM" = "SunOS" ] then case `cc -V 2>&1` in - *4*) CC=sc4;; - *5*) CC=sc5;; - *) CC=cc;; + *4*) CC=cc;; + *5*) CC=cc;; + *) CC=sc3;; esac fi fi diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 65481153c..230a591e4 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -119,11 +119,11 @@ extern "C" { /* This is where the long long data type is 64 bits, but long is 32. * For machines where there are 64bit registers, this is the mode to use. * IRIX, on R4000 and above should use this mode, along with the relevent - * assember code :-). Do NOT define BN_ULLONG. + * assember code :-). Do NOT define BN_LLONG. 
*/ #ifdef SIXTY_FOUR_BIT -#define BN_LLONG -/* #define BN_ULLONG unsigned long long */ +#undef BN_LLONG +#undef BN_ULLONG #define BN_ULONG unsigned long long #define BN_LONG long long #define BN_BITS 128 diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c index bd53124f1..64c9fd9dc 100644 --- a/crypto/bn/bn_lib.c +++ b/crypto/bn/bn_lib.c @@ -150,7 +150,7 @@ char *BN_options(void) int BN_num_bits_word(BN_ULONG l) { - static char bits[256]={ + static const char bits[256]={ 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, @@ -343,8 +343,9 @@ void BN_CTX_free(BN_CTX *c) BIGNUM *bn_expand2(BIGNUM *b, int words) { - BN_ULONG *A,*B,*a; - int i,j; + BN_ULONG *A,*a; + const BN_ULONG *B; + int i; bn_check_top(b); @@ -362,15 +363,38 @@ BIGNUM *bn_expand2(BIGNUM *b, int words) BNerr(BN_F_BN_EXPAND2,ERR_R_MALLOC_FAILURE); return(NULL); } -memset(A,0x5c,sizeof(BN_ULONG)*(words+1)); #if 1 B=b->d; /* Check if the previous number needs to be copied */ if (B != NULL) { +#if 0 /* This lot is an unrolled loop to copy b->top * BN_ULONGs from B to A */ +/* + * I have nothing against unrolling but it's usually done for + * several reasons, namely: + * - minimize percentage of decision making code, i.e. branches; + * - avoid cache thrashing; + * - make it possible to schedule loads earlier; + * Now let's examine the code below. The cornerstone of C is + * "programmer is always right" and that's what we love it for:-) + * For this very reason C compilers have to be paranoid when it + * comes to data aliasing and assume the worst. Yeah, but what + * does it mean in real life? This means that loop body below will + * be compiled to a sequence of loads immediately followed by stores + * as compiler assumes the worst, something in A==B+1 style. As a + * result CPU pipeline is going to starve for incoming data. Secondly + * if A and B happen to share the same cache line such code is going to + * cause severe cache thrashing. 
Both factors have a severe impact on + * performance of modern CPUs and this is the reason why this + * particular piece of code is #ifdefed away and replaced by a more + * "friendly" version found in the #else section below. This comment + * also applies to the BN_copy function. + * + * + */ for (i=b->top&(~7); i>0; i-=8) { A[0]=B[0]; A[1]=B[1]; A[2]=B[2]; A[3]=B[3]; @@ -407,6 +431,30 @@ memset(A,0x5c,sizeof(BN_ULONG)*(words+1)); */ ; } +#else + for (i=b->top>>2; i>0; i--,A+=4,B+=4) + { + /* + * The fact that the loop is unrolled + * 4-wise is a tribute to Intel. It's + * the one that doesn't have enough + * registers to accommodate more data. + * I'd unroll it 8-wise otherwise:-) + * + * + */ + BN_ULONG a0,a1,a2,a3; + a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3]; + A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3; + } + switch (b->top&3) + { + case 3: A[2]=B[2]; + case 2: A[1]=B[1]; + case 1: A[0]=B[0]; + case 0: ; /* ultrix cc workaround, see above */ + } +#endif Free(b->d); } @@ -415,22 +463,19 @@ memset(A,0x5c,sizeof(BN_ULONG)*(words+1)); /* Now need to zero any data between b->top and b->max */ - B= &(b->d[b->top]); - j=(b->max - b->top) & ~7; - for (i=0; i<j; i+=8) + A= &(b->d[b->top]); + for (i=(b->max - b->top)>>3; i>0; i--,A+=8) { - B[0]=0; B[1]=0; B[2]=0; B[3]=0; - B[4]=0; B[5]=0; B[6]=0; B[7]=0; - B+=8; - } - j=(b->max - b->top) & 7; - for (i=0; i<j; i++) - { - B[0]=0; - B++; + A[0]=0; A[1]=0; A[2]=0; A[3]=0; + A[4]=0; A[5]=0; A[6]=0; A[7]=0; } + for (i=(b->max - b->top)&7; i>0; i--,A++) + A[0]=0; #else - memcpy(a->d,b->d,sizeof(b->d[0])*b->top); + memset(A,0,sizeof(BN_ULONG)*(words+1)); + memcpy(A,b->d,sizeof(b->d[0])*b->top); + b->d=a; + b->max=words; #endif /* memset(&(p[b->max]),0,((words+1)-b->max)*sizeof(BN_ULONG)); */ @@ -454,7 +499,8 @@ BIGNUM *BN_dup(BIGNUM *a) BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b) { int i; - BN_ULONG *A,*B; + BN_ULONG *A; + const BN_ULONG *B; bn_check_top(b); @@ -464,47 +510,18 @@ BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b) #if 1 A=a->d; B=b->d; - for (i=b->top&(~7); i>0; i-=8) + for (i=b->top>>2; i>0; i--,A+=4,B+=4) { - A[0]=B[0]; - A[1]=B[1]; - A[2]=B[2]; - A[3]=B[3]; - A[4]=B[4]; - A[5]=B[5]; - 
A[6]=B[6]; - A[7]=B[7]; - A+=8; - B+=8; + BN_ULONG a0,a1,a2,a3; + a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3]; + A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3; } - switch (b->top&7) + switch (b->top&3) { - case 7: - A[6]=B[6]; - case 6: - A[5]=B[5]; - case 5: - A[4]=B[4]; - case 4: - A[3]=B[3]; - case 3: - A[2]=B[2]; - case 2: - A[1]=B[1]; - case 1: - A[0]=B[0]; - case 0: - /* I need the 'case 0' entry for utrix cc. - * If the optimiser is turned on, it does the - * switch table by doing - * a=top&7 - * a--; - * goto jump_table[a]; - * If top is 0, this makes us jump to 0xffffffc which is - * rather bad :-(. - * eric 23-Apr-1998 - */ - ; + case 3: A[2]=B[2]; + case 2: A[1]=B[1]; + case 1: A[0]=B[0]; + case 0: ; /* ultrix cc workaround, see comments in bn_expand2 */ } #else memcpy(a->d,b->d,sizeof(b->d[0])*b->top); @@ -539,6 +556,8 @@ BN_ULONG BN_get_word(BIGNUM *a) #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */ ret<<=BN_BITS4; /* stops the compiler complaining */ ret<<=BN_BITS4; +#else + ret=0; #endif ret|=a->d[i]; } @@ -563,6 +582,8 @@ int BN_set_word(BIGNUM *a, BN_ULONG w) #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */ w>>=BN_BITS4; w>>=BN_BITS4; +#else + w=0; #endif a->d[i]=(BN_ULONG)w&BN_MASK2; if (a->d[i] != 0) a->top=i+1; @@ -699,7 +720,7 @@ int BN_set_bit(BIGNUM *a, int n) a->top=i+1; } - a->d[i]|=(1L<d[i]|=(((BN_ULONG)1)<top <= i) return(0); - a->d[i]&=(~(1L<d[i]&=(~(((BN_ULONG)1)<