From 59c7029862869306dfd7ca01488bdaf7ed2f6816 Mon Sep 17 00:00:00 2001 From: "Dr. Stephen Henson" Date: Fri, 26 Nov 2004 01:04:55 +0000 Subject: [PATCH 01/12] Typo. --- doc/apps/asn1parse.pod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/apps/asn1parse.pod b/doc/apps/asn1parse.pod index e3462aabf..542d96906 100644 --- a/doc/apps/asn1parse.pod +++ b/doc/apps/asn1parse.pod @@ -165,7 +165,7 @@ Example config file: =head1 BUGS -There should be options to change the format of input lines. The output of some +There should be options to change the format of output lines. The output of some ASN.1 types is not well handled (if at all). =cut From d675c74d14f9e7f7046ee7b383e41868782ab5bd Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 26 Nov 2004 15:07:50 +0000 Subject: [PATCH 02/12] RC4 IA-64 assembler implementation. --- crypto/rc4/asm/rc4-ia64.S | 148 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 crypto/rc4/asm/rc4-ia64.S diff --git a/crypto/rc4/asm/rc4-ia64.S b/crypto/rc4/asm/rc4-ia64.S new file mode 100644 index 000000000..4af7fba7b --- /dev/null +++ b/crypto/rc4/asm/rc4-ia64.S @@ -0,0 +1,148 @@ +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// +// Rights for redistribution and usage in source and binary forms are +// granted according to the OpenSSL license. Warranty of any kind is +// disclaimed. +// ==================================================================== + +.ident "rc4-ia64.S, Version 1.0" +.ident "IA-64 ISA artwork by Andy Polyakov " + +// What's wrong with compiler generated code? Because of the nature of +// C language, compiler doesn't [dare to] reorder load and stores. But +// being memory-bound, RC4 should benefit from reorder [on in-order- +// execution core such as IA-64]. But what can we reorder? 
At the very +// least we can safely reorder references to key schedule in respect +// to input and output streams. Secondly, less obvious, it's possible +// to pull up some references to elements of the key schedule itself. +// Fact is that such prior loads are not safe only for "degenerated" +// key schedule, when all elements equal to the same value, which is +// never the case [key schedule setup routine makes sure it's not]. +// Furthermore. In order to compress loop body to the minimum, I chose +// to deploy deposit instruction, which substitutes for the whole +// key->data+((x&255)<<log2(sizeof(key->data[0]))). This unfortunately +// requires key->data to be aligned at sizeof(key->data) boundary. +// This is why you'll find "RC4_INT pad[512-256-2];" addendum to RC4_KEY +// and "d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));" in +// rc4_skey.c [and rc4_enc.c, where it's retained for debugging +// purposes]. Throughput is ~210MBps on 900MHz CPU, which is >3x +// faster than gcc generated code and +30% - if compared to HP-UX C. +// Unrolling loop below should give >30% on top of that... + +.text +.explicit + +#if defined(_HPUX_SOURCE) && !defined(_LP64) +# define ADDP addp4 +#else +# define ADDP add +#endif + +#define SZ 4 // this is set to sizeof(RC4_INT) +// SZ==4 seems to be optimal. At least SZ==8 is not any faster, not for +// assembler implementation, while SZ==1 code is ~30% slower. 
+#if SZ==1 // RC4_INT is unsigned char +# define LDKEY ld1 +# define STKEY st1 +# define OFF 0 +#elif SZ==4 // RC4_INT is unsigned int +# define LDKEY ld4 +# define STKEY st4 +# define OFF 2 +#elif SZ==8 // RC4_INT is unsigned long +# define LDKEY ld8 +# define STKEY st8 +# define OFF 3 +#endif + +out=r8; // [expanded] output pointer +inp=r9; // [expanded] input pointer +prsave=r10; +key=r28; // [expanded] pointer to RC4_KEY +ksch=r29; // (key->data+255)[&~(sizeof(key->data)-1)] +xx=r30; +yy=r31; + +// void RC4(RC4_KEY *key,size_t len,const void *inp,void *out); +.global RC4# +.proc RC4# +.align 32 +.skip 16 +RC4: + .prologue + .fframe 0 + .save ar.pfs,r2 + .save ar.lc,r3 + .save pr,prsave +{ .mii; alloc r2=ar.pfs,4,12,0,16 + mov prsave=pr + ADDP key=0,in0 };; +{ .mib; cmp.eq p6,p0=0,in1 // len==0? + mov r3=ar.lc +(p6) br.ret.spnt.many b0 };; // emergency exit + + .body + .rotr dat[4],key_x[4],tx[2],rnd[2],key_y[2],ty[1]; + +{ .mib; LDKEY xx=[key],SZ // load key->x + add in1=-1,in1 // adjust len for loop counter + nop.b 0 } +{ .mib; ADDP inp=0,in2 + ADDP out=0,in3 + brp.loop.imp .Ltop,.Lexit-16 };; +{ .mmi; LDKEY yy=[key] // load key->y + add ksch=(255+1)*SZ,key // as ksch will be used with + // deposit instruction only, + // I don't have to &~255... + mov ar.lc=in1 } +{ .mmi; nop.m 0 + add xx=1,xx + mov pr.rot=1<<16 };; +{ .mii; nop.m 0 + dep key_x[1]=xx,ksch,OFF,8 + mov ar.ec=3 };; // note that epilogue counter + // is off by 1. I compensate + // for this at exit... +.Ltop: +// The loop is scheduled for 3*(n+2) spin-rate on Itanium 2, which +// theoretically gives asymptotic performance of clock frequency +// divided by 3 bytes per second, or 500MBps on 1.5GHz CPU. Measured +// performance however is distinctly lower than 1/4:-( The culprit +// seems to be *(out++)=dat, which inadvertently splits the bundle, +// even though there is M-unit available... Unrolling is due... 
+// Unrolled loop should collect output with variable shift instruction +// in order to avoid starvation for integer shifter... Only output +// pointer has to be aligned... It should be possible to get pretty +// close to theoretical peak... +{ .mmi; (p16) LDKEY tx[0]=[key_x[1]] // tx=key[xx] + (p17) LDKEY ty[0]=[key_y[1]] // ty=key[yy] + (p18) dep rnd[1]=rnd[1],ksch,OFF,8} // &key[(tx+ty)&255] +{ .mmi; (p19) st1 [out]=dat[3],1 // *(out++)=dat + (p16) add xx=1,xx // x++ + (p0) nop.i 0 };; +{ .mmi; (p18) LDKEY rnd[1]=[rnd[1]] // rnd=key[(tx+ty)&255] + (p16) ld1 dat[0]=[inp],1 // dat=*(inp++) + (p16) dep key_x[0]=xx,ksch,OFF,8 } // &key[xx&255] +{ .mmi; (p0) nop.m 0 + (p16) add yy=yy,tx[0] // y+=tx + (p0) nop.i 0 };; +{ .mmi; (p17) STKEY [key_y[1]]=tx[1] // key[yy]=tx + (p17) STKEY [key_x[2]]=ty[0] // key[xx]=ty + (p16) dep key_y[0]=yy,ksch,OFF,8 } // &key[yy&255] +{ .mmb; (p17) add rnd[0]=tx[1],ty[0] // tx+=ty + (p18) xor dat[2]=dat[2],rnd[1] // dat^=rnd + br.ctop.sptk .Ltop };; +.Lexit: +{ .mib; STKEY [key]=yy,-SZ // save key->y + mov pr=prsave,0x1ffff + nop.b 0 } +{ .mib; st1 [out]=dat[3],1 // compensate for truncated + // epilogue counter + add xx=-1,xx + nop.b 0 };; +{ .mib; STKEY [key]=xx // save key->x + mov ar.lc=r3 + br.ret.sptk.many b0 };; +.endp RC4# From bc3e7fabe7f93084d8d93f55bebe20057a0b6970 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 26 Nov 2004 15:12:17 +0000 Subject: [PATCH 03/12] Engage RC4 IA-64 assembler module. 
--- Configure | 17 ++++++++++------- TABLE | 32 ++++++++++++++++---------------- crypto/rc4/Makefile.ssl | 3 +++ crypto/rc4/rc4.h | 4 ++++ crypto/rc4/rc4_enc.c | 4 ++++ crypto/rc4/rc4_skey.c | 4 ++++ crypto/sha/Makefile.ssl | 4 ++-- 7 files changed, 43 insertions(+), 25 deletions(-) diff --git a/Configure b/Configure index cce2af2b8..ac3d86f77 100755 --- a/Configure +++ b/Configure @@ -117,6 +117,9 @@ my $bits2="SIXTY_FOUR_BIT "; my $x86_elf_asm="x86cpuid-elf.o:asm/bn86-elf.o asm/co86-elf.o:asm/dx86-elf.o asm/yx86-elf.o::asm/bx86-elf.o:asm/mx86-elf.o:asm/sx86-elf.o asm/s512sse2-elf.o:asm/cx86-elf.o:asm/rx86-elf.o:asm/rm86-elf.o:asm/r586-elf.o"; my $x86_coff_asm="x86cpuid-cof.o:asm/bn86-cof.o asm/co86-cof.o:asm/dx86-cof.o asm/yx86-cof.o::asm/bx86-cof.o:asm/mx86-cof.o:asm/sx86-cof.o asm/s512sse2-cof.o:asm/cx86-cof.o:asm/rx86-cof.o:asm/rm86-cof.o:asm/r586-cof.o"; my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o::bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o"; + +my $ia64_asm=":asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o::asm/rc4-ia64.o::"; + my $no_asm="::::::::::"; # -DB_ENDIAN slows things down on a sparc for md5, but helps sha1. 
@@ -258,13 +261,13 @@ my %table=( "hpux64-parisc2-cc","cc:+DD64 +O3 +Optrs_strongly_typed -Ae +ESlit -DB_ENDIAN -DMD32_XARRAY -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::asm/pa-risc2W.o::::::::::dlfcn:hpux-shared:+Z:+DD64 -b:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # HP/UX IA-64 targets -"hpux-ia64-cc","cc:-Ae +DD32 +O2 +Olit=all -z -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:hpux-shared:+Z:+DD32 -b:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"hpux-ia64-cc","cc:-Ae +DD32 +O2 +Olit=all -z -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT:${ia64_asm}:dlfcn:hpux-shared:+Z:+DD32 -b:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # Frank Geurts has patiently assisted with # with debugging of the following config. -"hpux64-ia64-cc","cc:-Ae +DD64 +O3 +Olit=all -z -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:hpux-shared:+Z:+DD64 -b:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"hpux64-ia64-cc","cc:-Ae +DD64 +O3 +Olit=all -z -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT:${ia64_asm}:dlfcn:hpux-shared:+Z:+DD64 -b:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # GCC builds... 
-"hpux-ia64-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:hpux-shared:-fpic:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"hpux64-ia64-gcc","gcc:-mlp64 -O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:hpux-shared:-fpic:-mlp64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"hpux-ia64-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o::asm/rc4-ia64.o:::dlfcn:hpux-shared:-fpic:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"hpux64-ia64-gcc","gcc:-mlp64 -O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT:${ia64_asm}:dlfcn:hpux-shared:-fpic:-mlp64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", # Legacy HPUX 9.X configs... 
"hpux-cc", "cc:-DB_ENDIAN -DBN_DIV2W -DMD32_XARRAY -Ae +ESlit +O2 -z::(unknown)::-Wl,+s -ldld:DES_PTR DES_UNROLL DES_RISC1:${no_asm}:dl:hpux-shared:+Z:-b:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", @@ -316,8 +319,8 @@ my %table=( "linux-m68k", "gcc:-DB_ENDIAN -DTERMIO -O2 -fomit-frame-pointer -Wall::-D_REENTRANT:::BN_LLONG::", "linux-s390", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer 
-Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-parisc", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::", @@ -356,7 +359,7 @@ my %table=( "FreeBSD-elf", "gcc:-DTERMIOS -DL_ENDIAN -fomit-frame-pointer -O3 -m486 -Wall::-pthread -D_REENTRANT -D_THREAD_SAFE -D_THREADSAFE:::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "FreeBSD-sparc64","gcc:-DB_ENDIAN -DTERMIOS -O3 -fomit-frame-pointer::-pthread -D_REENTRANT -D_THREAD_SAFE -D_THREADSAFE:::SIXTY_FOUR_BIT_LONG DES_INT DES_PTR DES_RISC2 BF_PTR:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"FreeBSD-ia64", "gcc:-DL_ENDIAN -DTERMIOS -O -fomit-frame-pointer::(unknown):::SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", +"FreeBSD-ia64", "gcc:-DL_ENDIAN -DTERMIOS -O -fomit-frame-pointer::(unknown):::SIXTY_FOUR_BIT_LONG RC4_CHUNK:${ia64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "FreeBSD-alpha","gcc:-DTERMIOS -O::(unknown):::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC2:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "FreeBSD", "gcc:-DTERMIOS -DL_ENDIAN -fomit-frame-pointer -O3 -m486 -Wall::(unknown):::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_out_asm}", diff --git a/TABLE b/TABLE index e8a258608..04c340a4d 100644 --- a/TABLE +++ b/TABLE @@ -250,16 +250,16 @@ $unistd = $thread_cflag = (unknown) $sys_id = $lflags = -$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK $cpuid_obj = $bn_obj = asm/ia64.o $des_obj = -$aes_obj = +$aes_obj = asm/aes-ia64.o $bf_obj = $md5_obj = -$sha1_obj = +$sha1_obj = asm/sha256-ia64.o asm/sha512-ia64.o $cast_obj = -$rc4_obj = +$rc4_obj = asm/rc4-ia64.o $rmd160_obj = 
$rc5_obj = $dso_scheme = dlfcn @@ -2410,7 +2410,7 @@ $unistd = $thread_cflag = $sys_id = $lflags = -ldl -$bn_ops = SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT +$bn_ops = SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT $cpuid_obj = $bn_obj = asm/ia64.o $des_obj = @@ -2419,7 +2419,7 @@ $bf_obj = $md5_obj = $sha1_obj = asm/sha256-ia64.o asm/sha512-ia64.o $cast_obj = -$rc4_obj = +$rc4_obj = asm/rc4-ia64.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2437,7 +2437,7 @@ $unistd = $thread_cflag = $sys_id = $lflags = -ldl -$bn_ops = SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT +$bn_ops = SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT $cpuid_obj = $bn_obj = asm/ia64.o $des_obj = @@ -2446,7 +2446,7 @@ $bf_obj = $md5_obj = $sha1_obj = asm/sha256-ia64.o asm/sha512-ia64.o $cast_obj = -$rc4_obj = +$rc4_obj = asm/rc4-ia64.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2626,7 +2626,7 @@ $unistd = $thread_cflag = $sys_id = $lflags = -ldl -$bn_ops = SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT +$bn_ops = SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT $cpuid_obj = $bn_obj = asm/ia64.o $des_obj = @@ -2635,7 +2635,7 @@ $bf_obj = $md5_obj = $sha1_obj = asm/sha256-ia64.o asm/sha512-ia64.o $cast_obj = -$rc4_obj = +$rc4_obj = asm/rc4-ia64.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2653,7 +2653,7 @@ $unistd = $thread_cflag = $sys_id = $lflags = -ldl -$bn_ops = SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT +$bn_ops = SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT $cpuid_obj = $bn_obj = asm/ia64.o $des_obj = @@ -2662,7 +2662,7 @@ $bf_obj = $md5_obj = $sha1_obj = asm/sha256-ia64.o asm/sha512-ia64.o $cast_obj = -$rc4_obj = +$rc4_obj = asm/rc4-ia64.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -3139,7 +3139,7 @@ $unistd = $thread_cflag = -D_REENTRANT $sys_id = $lflags = -ldl -$bn_ops = 
SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK $cpuid_obj = $bn_obj = asm/ia64.o $des_obj = @@ -3148,7 +3148,7 @@ $bf_obj = $md5_obj = $sha1_obj = asm/sha256-ia64.o asm/sha512-ia64.o $cast_obj = -$rc4_obj = +$rc4_obj = asm/rc4-ia64.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -3166,7 +3166,7 @@ $unistd = $thread_cflag = -D_REENTRANT $sys_id = $lflags = -ldl -$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK $cpuid_obj = $bn_obj = asm/ia64.o $des_obj = @@ -3175,7 +3175,7 @@ $bf_obj = $md5_obj = $sha1_obj = asm/sha256-ia64.o asm/sha512-ia64.o $cast_obj = -$rc4_obj = +$rc4_obj = asm/rc4-ia64.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn diff --git a/crypto/rc4/Makefile.ssl b/crypto/rc4/Makefile.ssl index 5878aecf8..08e411403 100644 --- a/crypto/rc4/Makefile.ssl +++ b/crypto/rc4/Makefile.ssl @@ -65,6 +65,9 @@ rx86-out.s: asm/rc4-586.pl ../perlasm/x86asm.pl asm/rc4-amd64.s: asm/rc4-amd64.pl; $(PERL) $< $@ +asm/rc4-ia64.s: asm/rc4-ia64.S + $(CC) $(CFLAGS) -E asm/rc4-ia64.S > $@ + files: $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO diff --git a/crypto/rc4/rc4.h b/crypto/rc4/rc4.h index 7aec04fe9..c24a5b128 100644 --- a/crypto/rc4/rc4.h +++ b/crypto/rc4/rc4.h @@ -72,6 +72,10 @@ typedef struct rc4_key_st { RC4_INT x,y; RC4_INT data[256]; +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + /* see crypto/rc4/asm/rc4-ia64.S for further details... */ + RC4_INT pad[512-256-2]; +#endif } RC4_KEY; diff --git a/crypto/rc4/rc4_enc.c b/crypto/rc4/rc4_enc.c index d5f18a3a7..81a97ea3b 100644 --- a/crypto/rc4/rc4_enc.c +++ b/crypto/rc4/rc4_enc.c @@ -77,6 +77,10 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata, x=key->x; y=key->y; d=key->data; +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + /* see crypto/rc4/asm/rc4-ia64.S for further details... 
*/ + d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1)); +#endif #if defined(RC4_CHUNK) /* diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c index 781ff2d8b..083b53dfb 100644 --- a/crypto/rc4/rc4_skey.c +++ b/crypto/rc4/rc4_skey.c @@ -93,6 +93,10 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) unsigned int i; d= &(key->data[0]); +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + /* see crypto/rc4/asm/rc4-ia64.S for further details... */ + d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1)); +#endif key->x = 0; key->y = 0; id1=id2=0; diff --git a/crypto/sha/Makefile.ssl b/crypto/sha/Makefile.ssl index b653052bb..8d85dd442 100644 --- a/crypto/sha/Makefile.ssl +++ b/crypto/sha/Makefile.ssl @@ -65,9 +65,9 @@ s512sse2-out.s: asm/sha512-sse2.pl ../perlasm/x86asm.pl (cd asm; $(PERL) sha512-sse2.pl a.out $(CFLAGS) $(PROCESSOR) > ../$@) asm/sha256-ia64.s: asm/sha512-ia64.pl - (cd asm; $(PERL) sha512-ia64.pl $@ $(CFLAGS)) + (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS)) asm/sha512-ia64.s: asm/sha512-ia64.pl - (cd asm; $(PERL) sha512-ia64.pl $@ $(CFLAGS)) + (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS)) files: $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO From ea681ba87228c3b26f143f9c1aca07b114dbfd40 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 26 Nov 2004 15:26:09 +0000 Subject: [PATCH 04/12] Summarize recent RC4 tune-ups. --- CHANGES | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES b/CHANGES index 28bd44aeb..1227d35e2 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,10 @@ Changes between 0.9.7e and 0.9.8 [xx XXX xxxx] + *) RC4 performance overhaul on modern architectures/implementations, such + as Intel P4, IA-64 and AMD64. + [Andy Polyakov] + *) New utility extract-section.pl. This can be used specify an alternative section number in a pod file instead of having to treat each file as a separate case in Makefile. 
This can be done by adding two lines to the From 914c2a28c05797dc44fb3f498e6e12e5bc0db2b3 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sat, 27 Nov 2004 15:14:58 +0000 Subject: [PATCH 05/12] perlasm/x86[ms|nasm] update to accomodate updated RC4 assembler module. --- crypto/perlasm/x86ms.pl | 9 +++++---- crypto/perlasm/x86nasm.pl | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crypto/perlasm/x86ms.pl b/crypto/perlasm/x86ms.pl index 376476058..4f4a6fea9 100644 --- a/crypto/perlasm/x86ms.pl +++ b/crypto/perlasm/x86ms.pl @@ -174,6 +174,7 @@ sub main'leave { &out0("leave"); } sub main'cpuid { &out0("DW\t0A20Fh"); } sub main'rdtsc { &out0("DW\t0310Fh"); } sub main'halt { &out0("hlt"); } +sub main'movz { &out2("movzx",@_); } # SSE2 sub main'emms { &out0("emms"); } @@ -255,7 +256,7 @@ sub main'function_begin push(@labels,$func); local($tmp)=<<"EOF"; -_TEXT SEGMENT +_TEXT\$ SEGMENT PARA PUBLIC _$func $extra _$func PROC NEAR @@ -273,7 +274,7 @@ sub main'function_begin_B local($func,$extra)=@_; local($tmp)=<<"EOF"; -_TEXT SEGMENT +_TEXT\$ SEGMENT PARA PUBLIC _$func $extra _$func PROC NEAR @@ -293,7 +294,7 @@ sub main'function_end pop ebp ret _$func ENDP -_TEXT ENDS +_TEXT\$ ENDS EOF push(@out,$tmp); $stack=0; @@ -306,7 +307,7 @@ sub main'function_end_B local($tmp)=<<"EOF"; _$func ENDP -_TEXT ENDS +_TEXT\$ ENDS EOF push(@out,$tmp); $stack=0; diff --git a/crypto/perlasm/x86nasm.pl b/crypto/perlasm/x86nasm.pl index 27080a0f8..965bff1bd 100644 --- a/crypto/perlasm/x86nasm.pl +++ b/crypto/perlasm/x86nasm.pl @@ -192,6 +192,7 @@ sub main'leave { &out0("leave"); } sub main'cpuid { &out0("cpuid"); } sub main'rdtsc { &out0("rdtsc"); } sub main'halt { &out0("hlt"); } +sub main'movz { &out2("movzx",@_); } # SSE2 sub main'emms { &out0("emms"); } From 30b415b0765b465e71262d051b7b16b604a855be Mon Sep 17 00:00:00 2001 From: Richard Levitte Date: Mon, 29 Nov 2004 11:28:08 +0000 Subject: [PATCH 06/12] Make an explicit check during certificate validation to see that the 
CA setting in each certificate on the chain is correct. As a side- effect always do the following basic checks on extensions, not just when there's an associated purpose to the check: - if there is an unhandled critical extension (unless the user has chosen to ignore this fault) - if the path length has been exceeded (if one is set at all) - that certain extensions fit the associated purpose (if one has been given) --- apps/verify.c | 1 + crypto/x509/x509_txt.c | 2 ++ crypto/x509/x509_vfy.c | 74 +++++++++++++++++++++++++++++++++++------ crypto/x509/x509_vfy.h | 7 ++-- crypto/x509v3/v3_purp.c | 50 +++++++++++++--------------- crypto/x509v3/x509v3.h | 1 + 6 files changed, 95 insertions(+), 40 deletions(-) diff --git a/apps/verify.c b/apps/verify.c index f7c85b8dd..9ff32cb06 100644 --- a/apps/verify.c +++ b/apps/verify.c @@ -348,6 +348,7 @@ static int MS_CALLBACK cb(int ok, X509_STORE_CTX *ctx) if (ctx->error == X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT) ok=1; /* Continue after extension errors too */ if (ctx->error == X509_V_ERR_INVALID_CA) ok=1; + if (ctx->error == X509_V_ERR_INVALID_NON_CA) ok=1; if (ctx->error == X509_V_ERR_PATH_LENGTH_EXCEEDED) ok=1; if (ctx->error == X509_V_ERR_INVALID_PURPOSE) ok=1; if (ctx->error == X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT) ok=1; diff --git a/crypto/x509/x509_txt.c b/crypto/x509/x509_txt.c index ddc3b9b35..57ff33dc1 100644 --- a/crypto/x509/x509_txt.c +++ b/crypto/x509/x509_txt.c @@ -122,6 +122,8 @@ const char *X509_verify_cert_error_string(long n) return("certificate revoked"); case X509_V_ERR_INVALID_CA: return ("invalid CA certificate"); + case X509_V_ERR_INVALID_NON_CA: + return ("invalid non-CA certificate (has CA markings)"); case X509_V_ERR_PATH_LENGTH_EXCEEDED: return ("path length constraint exceeded"); case X509_V_ERR_INVALID_PURPOSE: diff --git a/crypto/x509/x509_vfy.c b/crypto/x509/x509_vfy.c index 1e66786bd..5e2cc82c5 100644 --- a/crypto/x509/x509_vfy.c +++ b/crypto/x509/x509_vfy.c @@ -73,7 +73,7 @@ static int 
null_callback(int ok,X509_STORE_CTX *e); static int check_issued(X509_STORE_CTX *ctx, X509 *x, X509 *issuer); static X509 *find_issuer(X509_STORE_CTX *ctx, STACK_OF(X509) *sk, X509 *x); -static int check_chain_purpose(X509_STORE_CTX *ctx); +static int check_chain_extensions(X509_STORE_CTX *ctx); static int check_trust(X509_STORE_CTX *ctx); static int check_revocation(X509_STORE_CTX *ctx); static int check_cert(X509_STORE_CTX *ctx); @@ -285,7 +285,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx) } /* We have the chain complete: now we need to check its purpose */ - if (param->purpose > 0) ok = check_chain_purpose(ctx); + ok = check_chain_extensions(ctx); if (!ok) goto end; @@ -381,15 +381,25 @@ static int get_issuer_sk(X509 **issuer, X509_STORE_CTX *ctx, X509 *x) * with the supplied purpose */ -static int check_chain_purpose(X509_STORE_CTX *ctx) +static int check_chain_extensions(X509_STORE_CTX *ctx) { #ifdef OPENSSL_NO_CHAIN_VERIFY return 1; #else - int i, ok=0; + int i, ok=0, must_be_ca; X509 *x; int (*cb)(); cb=ctx->verify_cb; + + /* must_be_ca can have 1 of 3 values: + -1: we accept both CA and non-CA certificates, to allow direct + use of self-signed certificates (which are marked as CA). + 0: we only accept non-CA certificates. This is currently not + used, but the possibility is present for future extensions. + 1: we only accept CA certificates. This is currently used for + all certificates in the chain except the leaf certificate. 
+ */ + must_be_ca = -1; /* Check all untrusted certificates */ for (i = 0; i < ctx->last_untrusted; i++) { @@ -404,20 +414,62 @@ static int check_chain_purpose(X509_STORE_CTX *ctx) ok=cb(0,ctx); if (!ok) goto end; } - ret = X509_check_purpose(x, ctx->param->purpose, i); - if ((ret == 0) - || ((ctx->param->flags & X509_V_FLAG_X509_STRICT) - && (ret != 1))) + ret = X509_check_ca(x); + switch(must_be_ca) { - if (i) + case -1: + if ((ctx->param->flags & X509_V_FLAG_X509_STRICT) + && (ret != 1) && (ret != 0)) + { + ret = 0; ctx->error = X509_V_ERR_INVALID_CA; + } else - ctx->error = X509_V_ERR_INVALID_PURPOSE; + ret = 1; + break; + case 0: + if (ret != 0) + { + ret = 0; + ctx->error = X509_V_ERR_INVALID_NON_CA; + } + else + ret = 1; + break; + default: + if ((ret == 0) + || ((ctx->param->flags & X509_V_FLAG_X509_STRICT) + && (ret != 1))) + { + ret = 0; + ctx->error = X509_V_ERR_INVALID_CA; + } + else + ret = 1; + break; + } + if (ret == 0) + { ctx->error_depth = i; ctx->current_cert = x; ok=cb(0,ctx); if (!ok) goto end; } + if (ctx->param->purpose > 0) + { + ret = X509_check_purpose(x, ctx->param->purpose, + must_be_ca > 0); + if ((ret == 0) + || ((ctx->param->flags & X509_V_FLAG_X509_STRICT) + && (ret != 1))) + { + ctx->error = X509_V_ERR_INVALID_PURPOSE; + ctx->error_depth = i; + ctx->current_cert = x; + ok=cb(0,ctx); + if (!ok) goto end; + } + } /* Check pathlen */ if ((i > 1) && (x->ex_pathlen != -1) && (i > (x->ex_pathlen + 1))) @@ -428,6 +480,8 @@ static int check_chain_purpose(X509_STORE_CTX *ctx) ok=cb(0,ctx); if (!ok) goto end; } + /* The next certificate must be a CA */ + must_be_ca = 1; } ok = 1; end: diff --git a/crypto/x509/x509_vfy.h b/crypto/x509/x509_vfy.h index e1bd21b80..5f49c2a8b 100644 --- a/crypto/x509/x509_vfy.h +++ b/crypto/x509/x509_vfy.h @@ -322,10 +322,11 @@ void X509_STORE_CTX_set_depth(X509_STORE_CTX *ctx, int depth); #define X509_V_ERR_UNHANDLED_CRITICAL_EXTENSION 34 #define X509_V_ERR_KEYUSAGE_NO_CRL_SIGN 35 #define 
X509_V_ERR_UNHANDLED_CRITICAL_CRL_EXTENSION 36 +#define X509_V_ERR_INVALID_NON_CA 37 -#define X509_V_ERR_INVALID_EXTENSION 37 -#define X509_V_ERR_INVALID_POLICY_EXTENSION 38 -#define X509_V_ERR_NO_EXPLICIT_POLICY 39 +#define X509_V_ERR_INVALID_EXTENSION 38 +#define X509_V_ERR_INVALID_POLICY_EXTENSION 39 +#define X509_V_ERR_NO_EXPLICIT_POLICY 40 /* The application is not happy */ diff --git a/crypto/x509v3/v3_purp.c b/crypto/x509v3/v3_purp.c index 67596862a..d37e79481 100644 --- a/crypto/x509v3/v3_purp.c +++ b/crypto/x509v3/v3_purp.c @@ -63,7 +63,6 @@ static void x509v3_cache_extensions(X509 *x); -static int ca_check(const X509 *x); static int check_ssl_ca(const X509 *x); static int check_purpose_ssl_client(const X509_PURPOSE *xp, const X509 *x, int ca); static int check_purpose_ssl_server(const X509_PURPOSE *xp, const X509 *x, int ca); @@ -426,8 +425,14 @@ static void x509v3_cache_extensions(X509 *x) #define ns_reject(x, usage) \ (((x)->ex_flags & EXFLAG_NSCERT) && !((x)->ex_nscert & (usage))) -static int ca_check(const X509 *x) +int X509_check_ca(X509 *x) { + if(!(x->ex_flags & EXFLAG_SET)) { + CRYPTO_w_lock(CRYPTO_LOCK_X509); + x509v3_cache_extensions(x); + CRYPTO_w_unlock(CRYPTO_LOCK_X509); + } + /* keyUsage if present should allow cert signing */ if(ku_reject(x, KU_KEY_CERT_SIGN)) return 0; if(x->ex_flags & EXFLAG_BCONS) { @@ -435,10 +440,17 @@ static int ca_check(const X509 *x) /* If basicConstraints says not a CA then say so */ else return 0; } else { + /* we support V1 roots for... uh, I don't really know why. 
*/ if((x->ex_flags & V1_ROOT) == V1_ROOT) return 3; /* If key usage present it must have certSign so tolerate it */ else if (x->ex_flags & EXFLAG_KUSAGE) return 4; - else return 2; + /* Older certificates could have Netscape-specific CA types */ + else if (x->ex_flags & EXFLAG_NSCERT + && x->ex_nscert & NS_ANY_CA) return 5; + /* 2 means "I don't know...", which is legal for V1 and V2 */ + else if (x->ex_flags & EXFLAG_V1) return 2; + /* can this still be regarded a CA certificate? I doubt it */ + return 0; } } @@ -446,14 +458,10 @@ static int ca_check(const X509 *x) static int check_ssl_ca(const X509 *x) { int ca_ret; - ca_ret = ca_check(x); + ca_ret = X509_check_ca(x); if(!ca_ret) return 0; /* check nsCertType if present */ - if(x->ex_flags & EXFLAG_NSCERT) { - if(x->ex_nscert & NS_SSL_CA) return ca_ret; - return 0; - } - if(ca_ret != 2) return ca_ret; + if(ca_ret != 5 || x->ex_nscert & NS_SSL_CA) return ca_ret; else return 0; } @@ -498,14 +506,10 @@ static int purpose_smime(const X509 *x, int ca) if(xku_reject(x,XKU_SMIME)) return 0; if(ca) { int ca_ret; - ca_ret = ca_check(x); + ca_ret = X509_check_ca(x); if(!ca_ret) return 0; /* check nsCertType if present */ - if(x->ex_flags & EXFLAG_NSCERT) { - if(x->ex_nscert & NS_SMIME_CA) return ca_ret; - return 0; - } - if(ca_ret != 2) return ca_ret; + if(ca_ret != 5 || x->ex_nscert & NS_SMIME_CA) return ca_ret; else return 0; } if(x->ex_flags & EXFLAG_NSCERT) { @@ -539,7 +543,7 @@ static int check_purpose_crl_sign(const X509_PURPOSE *xp, const X509 *x, int ca) { if(ca) { int ca_ret; - if((ca_ret = ca_check(x)) != 2) return ca_ret; + if((ca_ret = X509_check_ca(x)) != 2) return ca_ret; else return 0; } if(ku_reject(x, KU_CRL_SIGN)) return 0; @@ -552,17 +556,9 @@ static int check_purpose_crl_sign(const X509_PURPOSE *xp, const X509 *x, int ca) static int ocsp_helper(const X509_PURPOSE *xp, const X509 *x, int ca) { - /* Must be a valid CA */ - if(ca) { - int ca_ret; - ca_ret = ca_check(x); - if(ca_ret != 2) return ca_ret; - 
if(x->ex_flags & EXFLAG_NSCERT) { - if(x->ex_nscert & NS_ANY_CA) return ca_ret; - return 0; - } - return 0; - } + /* Must be a valid CA. Should we really support the "I don't know" + value (2)? */ + if(ca) return X509_check_ca(x); /* leaf certificate is checked in OCSP_verify() */ return 1; } diff --git a/crypto/x509v3/x509v3.h b/crypto/x509v3/x509v3.h index 677a930f7..a6436289c 100644 --- a/crypto/x509v3/x509v3.h +++ b/crypto/x509v3/x509v3.h @@ -578,6 +578,7 @@ int X509V3_EXT_print_fp(FILE *out, X509_EXTENSION *ext, int flag, int indent); int X509V3_extensions_print(BIO *out, char *title, STACK_OF(X509_EXTENSION) *exts, unsigned long flag, int indent); +int X509_check_ca(X509 *x); int X509_check_purpose(X509 *x, int id, int ca); int X509_supported_extension(X509_EXTENSION *ex); int X509_PURPOSE_set(int *p, int purpose); From 5022e4ecdf228dd79c9fc355a7b5047adbf9d414 Mon Sep 17 00:00:00 2001 From: Richard Levitte Date: Mon, 29 Nov 2004 11:57:00 +0000 Subject: [PATCH 07/12] Document the change. --- CHANGES | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 1227d35e2..47ffdcfde 100644 --- a/CHANGES +++ b/CHANGES @@ -743,7 +743,21 @@ differing sizes. [Richard Levitte] - Changes between 0.9.7d and 0.9.7e [XX xxx XXXX] + Changes between 0.9.7e and 0.9.7f [XX xxx XXXX] + + *) Make an explicit check during certificate validation to see that + the CA setting in each certificate on the chain is correct. 
As a + side effect always do the following basic checks on extensions, + not just when there's an associated purpose to the check: + + - if there is an unhandled critical extension (unless the user + has chosen to ignore this fault) + - if the path length has been exceeded (if one is set at all) + - that certain extensions fit the associated purpose (if one has + been given) + [Richard Levitte] + + Changes between 0.9.7d and 0.9.7e [25 Oct 2004] *) Avoid a race condition when CRLs are checked in a multi threaded environment. This would happen due to the reordering of the revoked From 7a3240e319b883c49c683387128c528957dd98e0 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 29 Nov 2004 21:12:58 +0000 Subject: [PATCH 08/12] Final touches to rc4/asm/rc4-586.pl, +52% better performance on AMD core. --- crypto/rc4/asm/rc4-586.pl | 48 ++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index dbe3803f5..977a9f123 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -1,6 +1,25 @@ #!/usr/local/bin/perl -# define for pentium pro friendly version +# At some point it became apparent that the original SSLeay RC4 +# assembler implementation performs suboptimal on latest IA-32 +# microarchitectures. After re-tuning performance has changed as +# following: +# +# Pentium +0% +# Pentium III +17% +# AMD +52%(*) +# P4 +180%(**) +# +# (*) This number is actually a trade-off:-) It's possible to +# achieve +72%, but at the cost of -48% off PIII performance. +# In other words code performing further 13% faster on AMD +# would perform almost 2 times slower on Intel PIII... +# For reference! This code delivers ~80% of rc4-amd64.pl +# performance on same Opteron machine. +# (**) This number requires compressed key schedule set up by +# RC4_set_key, see commentary section in rc4_skey.c for +# further details.
+# push(@INC,"perlasm","../../perlasm"); require "x86asm.pl"; @@ -46,20 +65,16 @@ sub RC4_loop # Moved out # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; - &add( $y, $tx); - &and( $y, 0xff); - &inc( $x); # NEXT ROUND + &add( &LB($y), &LB($tx)); + &inc( &LB($x)); # NEXT ROUND &mov( $ty, &DWP(0,$d,$y,4)); # XXX &mov( &DWP(-4,$d,$x,4),$ty); # AGI &add( $ty, $tx); - &and( $x, 0xff); # NEXT ROUND - &and( $ty, 0xff); &mov( &DWP(0,$d,$y,4),$tx); - &nop(); - &mov( $ty, &DWP(0,$d,$ty,4)); - &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND - # XXX + &and( $ty, 0xff); + &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND + &mov( $ty, &DWP(0,$d,$ty,4)); if (!$char) { @@ -99,19 +114,20 @@ sub RC4 &push("ebp"); &push("ebx"); &push("esi"); - &push("edi"); + &xor( $x, $x); # avoid partial register stalls + &push("edi"); + &xor( $y, $y); # avoid partial register stalls &mov( $d, &wparam(0)); # key &mov( $in, &wparam(2)); - &mov( $x, &DWP(0,$d,"",1)); - &mov( $y, &DWP(4,$d,"",1)); + &movb( &LB($x), &BP(0,$d,"",1)); + &movb( &LB($y), &BP(4,$d,"",1)); &mov( $out, &wparam(3)); - &inc( $x); + &inc( &LB($x)); &stack_push(3); # 3 temp variables &add( $d, 8); - &and( $x, 0xff); # detect compressed schedule, see commentary section in rc4_skey.c... &cmp(&DWP(256,$d),-1); @@ -200,7 +216,7 @@ sub RC4 &set_label("finished"); &dec( $x); &stack_pop(3); - &mov( &DWP(-4,$d,"",0),$y); + &movb( &BP(-4,$d,"",0),&LB($y)); &movb( &BP(-8,$d,"",0),&LB($x)); &function_end($name); From fc7fc5678f69a4f9bb0b155d9d45f70fc545f626 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 29 Nov 2004 21:19:56 +0000 Subject: [PATCH 09/12] sha1_block_asm_data_order can't hash if message crosses 2GB boundary. 
--- crypto/sha/asm/sha1-586.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index 9fe1a7554..4f8521f1e 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -410,7 +410,7 @@ sub sha1_block_data &mov(&DWP(16,$tmp1,"",0),$E); &cmp("esi","eax"); &mov(&DWP( 4,$tmp1,"",0),$B); - &jl(&label("start")); + &jb(&label("start")); &stack_pop(18+9); &pop("edi"); From 5073ff03463a3e21f4acfcdcfa0c1eda64145007 Mon Sep 17 00:00:00 2001 From: Richard Levitte Date: Tue, 30 Nov 2004 12:18:55 +0000 Subject: [PATCH 10/12] Split X509_check_ca() into a small self and an internal function check_ca(), to resolve constness issue. check_ca() is called from the purpose checkers instead of X509_check_ca(), since the stuff done by the latter (except for calling check_ca()) is also done by X509_check_purpose(). --- crypto/x509v3/v3_purp.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/crypto/x509v3/v3_purp.c b/crypto/x509v3/v3_purp.c index d37e79481..8d0ebbeae 100644 --- a/crypto/x509v3/v3_purp.c +++ b/crypto/x509v3/v3_purp.c @@ -425,14 +425,8 @@ static void x509v3_cache_extensions(X509 *x) #define ns_reject(x, usage) \ (((x)->ex_flags & EXFLAG_NSCERT) && !((x)->ex_nscert & (usage))) -int X509_check_ca(X509 *x) +static int check_ca(const X509 *x) { - if(!(x->ex_flags & EXFLAG_SET)) { - CRYPTO_w_lock(CRYPTO_LOCK_X509); - x509v3_cache_extensions(x); - CRYPTO_w_unlock(CRYPTO_LOCK_X509); - } - /* keyUsage if present should allow cert signing */ if(ku_reject(x, KU_KEY_CERT_SIGN)) return 0; if(x->ex_flags & EXFLAG_BCONS) { @@ -454,11 +448,22 @@ int X509_check_ca(X509 *x) } } +int X509_check_ca(X509 *x) +{ + if(!(x->ex_flags & EXFLAG_SET)) { + CRYPTO_w_lock(CRYPTO_LOCK_X509); + x509v3_cache_extensions(x); + CRYPTO_w_unlock(CRYPTO_LOCK_X509); + } + + return check_ca(x); +} + /* Check SSL CA: common checks for SSL client and server */ static int 
check_ssl_ca(const X509 *x) { int ca_ret; - ca_ret = X509_check_ca(x); + ca_ret = check_ca(x); if(!ca_ret) return 0; /* check nsCertType if present */ if(ca_ret != 5 || x->ex_nscert & NS_SSL_CA) return ca_ret; @@ -506,7 +511,7 @@ static int purpose_smime(const X509 *x, int ca) if(xku_reject(x,XKU_SMIME)) return 0; if(ca) { int ca_ret; - ca_ret = X509_check_ca(x); + ca_ret = check_ca(x); if(!ca_ret) return 0; /* check nsCertType if present */ if(ca_ret != 5 || x->ex_nscert & NS_SMIME_CA) return ca_ret; @@ -543,7 +548,7 @@ static int check_purpose_crl_sign(const X509_PURPOSE *xp, const X509 *x, int ca) { if(ca) { int ca_ret; - if((ca_ret = X509_check_ca(x)) != 2) return ca_ret; + if((ca_ret = check_ca(x)) != 2) return ca_ret; else return 0; } if(ku_reject(x, KU_CRL_SIGN)) return 0; @@ -558,7 +563,7 @@ static int ocsp_helper(const X509_PURPOSE *xp, const X509 *x, int ca) { /* Must be a valid CA. Should we really support the "I don't know" value (2)? */ - if(ca) return X509_check_ca(x); + if(ca) return check_ca(x); /* leaf certificate is checked in OCSP_verify() */ return 1; } From e6e1f4cb5e37f77fe61ff568dd2904f21ec5b82c Mon Sep 17 00:00:00 2001 From: "Mark J. Cox" Date: Tue, 30 Nov 2004 14:34:16 +0000 Subject: [PATCH 11/12] Mention that the keys likely to have signed the distribution are now listed on the web site for easy finding and downloading --- FAQ | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/FAQ b/FAQ index 463bc1e13..fd354b3b4 100644 --- a/FAQ +++ b/FAQ @@ -152,7 +152,8 @@ Use MD5 to check that a tarball from a mirror site is identical: md5sum TARBALL | awk '{print $1;}' | cmp - TARBALL.md5 You can check authenticity using pgp or gpg. You need the OpenSSL team -member public key used to sign it (download it from a key server). Then +member public key used to sign it (download it from a key server, see a +list of keys at ). 
Then just do: pgp TARBALL.asc From b7b46c9a87c9fe7275a84c5ecb9f5f3459d7b307 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 30 Nov 2004 15:46:46 +0000 Subject: [PATCH 12/12] Add 0.9.7 specific comments to RC4 assembler modules. --- crypto/rc4/asm/rc4-586.pl | 15 ++++++++++----- crypto/rc4/asm/rc4-amd64.pl | 4 +++- crypto/rc4/asm/rc4-ia64.S | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index 977a9f123..07b2bc6fc 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -1,7 +1,7 @@ #!/usr/local/bin/perl # At some point it became apparent that the original SSLeay RC4 -# assembler implementation performs suboptimal on latest IA-32 +# assembler implementation performs suboptimally on latest IA-32 # microarchitectures. After re-tuning performance has changed as # following: # @@ -15,10 +15,12 @@ # In other words code performing further 13% faster on AMD # would perform almost 2 times slower on Intel PIII... # For reference! This code delivers ~80% of rc4-amd64.pl -# performance on same Opteron machine. +# performance on the same Opteron machine. # (**) This number requires compressed key schedule set up by -# RC4_set_key, see commentary section in rc4_skey.c for -# further details. +# RC4_set_key and therefore doesn't apply to 0.9.7 [option for +# compressed key schedule is implemented in 0.9.8 and later, +# see commentary section in rc4_skey.c for further details]. +# # push(@INC,"perlasm","../../perlasm"); @@ -130,6 +132,8 @@ sub RC4 &add( $d, 8); # detect compressed schedule, see commentary section in rc4_skey.c... + # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, + # as compressed key schedule is set up in 0.9.8 and later. &cmp(&DWP(256,$d),-1); &je(&label("RC4_CHAR")); @@ -190,7 +194,8 @@ sub RC4 &jmp(&label("finished")); &align(16); - # this is essentially Intel P4 specific codepath, see rc4_skey.c...
+ # this is essentially Intel P4 specific codepath, see rc4_skey.c, + # and is engaged in 0.9.8 and later context... &set_label("RC4_CHAR"); &lea ($ty,&DWP(0,$in,$ty)); diff --git a/crypto/rc4/asm/rc4-amd64.pl b/crypto/rc4/asm/rc4-amd64.pl index 35e426d56..9e0da8af9 100755 --- a/crypto/rc4/asm/rc4-amd64.pl +++ b/crypto/rc4/asm/rc4-amd64.pl @@ -30,7 +30,9 @@ # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to # compose blended code, which would perform even within 30% marginal # on either AMD and Intel platforms, I implement both cases. See -# rc4_skey.c for further details... +# rc4_skey.c for further details... This applies to 0.9.8 and later. +# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes +# of code remain redundant. $output=shift; diff --git a/crypto/rc4/asm/rc4-ia64.S b/crypto/rc4/asm/rc4-ia64.S index 4af7fba7b..ae84af672 100644 --- a/crypto/rc4/asm/rc4-ia64.S +++ b/crypto/rc4/asm/rc4-ia64.S @@ -18,7 +18,7 @@ // to input and output streams. Secondly, less obvious, it's possible // to pull up some references to elements of the key schedule itself. // Fact is that such prior loads are not safe only for "degenerated" -// key schedule, when all elements equal to the same value, which is +// key schedule, when some elements equal to the same value, which is // never the case [key schedule setup routine makes sure it's not]. // Furthermore. In order to compress loop body to the minimum, I chose // to deploy deposit instruction, which substitutes for the whole