From 376729e1301f82a8f20ce78f36b7107c75720a7c Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 21 Nov 2004 10:36:25 +0000 Subject: [PATCH] RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code which would perform satisfactorily on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by a factor of 2.8x (on 32-bit core). --- Configure | 2 +- TABLE | 4 +- crypto/amd64cpuid.pl | 115 ++++++++++++++++++++++++++++++++- crypto/perlasm/x86unix.pl | 11 ++-- crypto/rc4/asm/rc4-586.pl | 59 +++++++++++++---- crypto/rc4/asm/rc4-amd64.pl | 29 +++++++++ crypto/rc4/rc4_locl.h | 1 + crypto/rc4/rc4_skey.c | 47 ++++++++++++-- doc/crypto/OPENSSL_ia32cap.pod | 7 +- 9 files changed, 242 insertions(+), 33 deletions(-) diff --git a/Configure b/Configure index cc91c3dcb..cce2af2b8 100755 --- a/Configure +++ b/Configure @@ -318,7 +318,7 @@ my %table=( "linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", -"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL::asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", 
+"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-parisc", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::", #### SPARC Linux setups diff --git a/TABLE b/TABLE index 2910ab3f1..e8a258608 100644 --- a/TABLE +++ b/TABLE @@ -2086,7 +2086,7 @@ $unistd = $thread_cflag = -D_REENTRANT $sys_id = $lflags = -rdynamic -ldl -$bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT +$bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o @@ -3572,7 +3572,7 @@ $thread_cflag = -D_REENTRANT $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL -$cpuid_obj = +$cpuid_obj = amd64cpuid.o $bn_obj = asm/x86_64-gcc.o $des_obj = $aes_obj = diff --git a/crypto/amd64cpuid.pl b/crypto/amd64cpuid.pl index baf801d06..097f6b8d5 100644 --- a/crypto/amd64cpuid.pl +++ b/crypto/amd64cpuid.pl @@ -5,16 +5,63 @@ $win64a=1 if ($output =~ /win64a\.[s|asm]/); open STDOUT,">$output" || die "can't open $output: $!"; print<<___ if(defined($win64a)); -TEXT SEGMENT +_TEXT SEGMENT PUBLIC OPENSSL_rdtsc ALIGN 16 -OPENSSL_rdtsc PROC NEAR +OPENSSL_rdtsc PROC rdtsc shl rdx,32 or rax,rdx ret OPENSSL_rdtsc ENDP -TEXT ENDS + +PUBLIC OPENSSL_atomic_add +ALIGN 16 +OPENSSL_atomic_add PROC + mov eax,DWORD PTR[rcx] +\$Lspin: lea r8,DWORD PTR[rdx+rax] +lock cmpxchg DWORD PTR[rcx],r8d + jne \$Lspin + mov eax,r8d + cdqe + ret +OPENSSL_atomic_add ENDP + +PUBLIC OPENSSL_wipe_cpu +ALIGN 16 
+OPENSSL_wipe_cpu PROC + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor rcx,rcx + xor rdx,rdx + xor r8,r8 + xor r9,r9 + xor r10,r10 + xor r11,r11 + lea rax,QWORD PTR[rsp+8] + ret +OPENSSL_wipe_cpu ENDP + +OPENSSL_ia32_cpuid PROC + mov r8,rbx + mov eax,1 + cpuid + shl rcx,32 + mov eax,edx + mov rbx,r8 + or rax,rcx + ret +OPENSSL_ia32_cpuid ENDP +_TEXT ENDS + +CRT\$XIU SEGMENT +EXTRN OPENSSL_cpuid_setup:PROC +DQ OPENSSL_cpuid_setup +CRT\$XIU ENDS END ___ print<<___ if(!defined($win64a)); @@ -27,4 +74,66 @@ OPENSSL_rdtsc: or %rdx,%rax ret .size OPENSSL_rdtsc,.-OPENSSL_rdtsc + +.globl OPENSSL_atomic_add +.type OPENSSL_atomic_add,\@function +.align 16 +OPENSSL_atomic_add: + movl (%rdi),%eax +.Lspin: lea (%rsi,%rax),%r8 +lock; cmpxchg %r8d,(%rdi) + jne .Lspin + mov %r8d,%eax + cdqe + ret +.size OPENSSL_atomic_add,.-OPENSSL_atomic_add + +.globl OPENSSL_wipe_cpu +.type OPENSSL_wipe_cpu,\@function +.align 16 +OPENSSL_wipe_cpu: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + xor %rcx,%rcx + xor %rdx,%rdx + xor %rsi,%rsi + xor %rdi,%rdi + xor %r8,%r8 + xor %r9,%r9 + xor %r10,%r10 + xor %r11,%r11 + lea 8(%rsp),%rax + ret +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu + +.globl OPENSSL_ia32_cpuid +.align 16 +OPENSSL_ia32_cpuid: + mov %rbx,%r8 + mov \$1,%eax + cpuid + shl \$32,%rcx + mov %edx,%eax + mov %r8,%rbx + or %rcx,%rax + ret +.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid + +.section .init + call OPENSSL_cpuid_setup + .align 16 ___ diff --git a/crypto/perlasm/x86unix.pl b/crypto/perlasm/x86unix.pl index 7d87eb170..867fa09e4 100644 --- a/crypto/perlasm/x86unix.pl +++ b/crypto/perlasm/x86unix.pl @@ -161,7 +161,7 @@ sub main'shl { &out2("sall",@_); } sub 
main'shr { &out2("shrl",@_); } sub main'xor { &out2("xorl",@_); } sub main'xorb { &out2("xorb",@_); } -sub main'add { &out2("addl",@_); } +sub main'add { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); } sub main'adc { &out2("adcl",@_); } sub main'sub { &out2("subl",@_); } sub main'sbb { &out2("sbbl",@_); } @@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); } sub main'jnc { &out1("jnc",@_); } sub main'jno { &out1("jno",@_); } sub main'dec { &out1("decl",@_); } -sub main'inc { &out1("incl",@_); } +sub main'inc { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); } sub main'push { &out1("pushl",@_); $stack+=4; } sub main'pop { &out1("popl",@_); $stack-=4; } sub main'pushf { &out0("pushfl"); $stack+=4; } @@ -205,9 +205,10 @@ sub main'nop { &out0("nop"); } sub main'test { &out2("testl",@_); } sub main'bt { &out2("btl",@_); } sub main'leave { &out0("leave"); } -sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); } -sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); } +sub main'cpuid { &out0(".byte\t0x0f,0xa2"); } +sub main'rdtsc { &out0(".byte\t0x0f,0x31"); } sub main'halt { &out0("hlt"); } +sub main'movz { &out2("movzb",@_); } # SSE2 sub main'emms { &out0("emms"); } @@ -558,7 +559,7 @@ sub main'file_end pushl %ebx movl %edx,%edi movl \$1,%eax - .byte 0x0f; .byte 0xa2 + .byte 0x0f,0xa2 orl \$1<<10,%edx movl %edx,0(%edi) popl %ebx diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index 7ef889e5a..dbe3803f5 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -7,10 +7,10 @@ require "x86asm.pl"; &asm_init($ARGV[0],"rc4-586.pl"); -$tx="eax"; -$ty="ebx"; -$x="ecx"; -$y="edx"; +$x="eax"; +$y="ebx"; +$tx="ecx"; +$ty="edx"; $in="esi"; $out="edi"; $d="ebp"; @@ -31,7 +31,7 @@ sub RC4_loop { &mov($ty, &swtmp(2)); &cmp($ty, $in); - &jle(&label("finished")); + &jbe(&label("finished")); &inc($in); } else @@ -39,7 +39,7 @@ sub RC4_loop &add($ty, 8); &inc($in); &cmp($ty, $in); - &jl(&label("finished")); + &jb(&label("finished")); &mov(&swtmp(2), $ty); } } @@ 
-88,35 +88,44 @@ sub RC4 &function_begin_B($name,""); + &mov($ty,&wparam(1)); # len + &cmp($ty,0); + &jne(&label("proceed")); + &ret(); + &set_label("proceed"); + &comment(""); &push("ebp"); &push("ebx"); - &mov( $d, &wparam(0)); # key - &mov( $ty, &wparam(1)); # num &push("esi"); &push("edi"); + &mov( $d, &wparam(0)); # key + &mov( $in, &wparam(2)); &mov( $x, &DWP(0,$d,"",1)); &mov( $y, &DWP(4,$d,"",1)); - &mov( $in, &wparam(2)); + &mov( $out, &wparam(3)); &inc( $x); &stack_push(3); # 3 temp variables &add( $d, 8); &and( $x, 0xff); + # detect compressed schedule, see commentary section in rc4_skey.c... + &cmp(&DWP(256,$d),-1); + &je(&label("RC4_CHAR")); + &lea( $ty, &DWP(-8,$ty,$in)); # check for 0 length input - &mov( $out, &wparam(3)); &mov( &swtmp(2), $ty); # this is now address to exit at &mov( $tx, &DWP(0,$d,$x,4)); &cmp( $ty, $in); - &jl( &label("end")); # less than 8 bytes + &jb( &label("end")); # less than 8 bytes &set_label("start"); @@ -148,7 +157,7 @@ sub RC4 &mov( &DWP(-4,$out,"",0), $tx); &mov( $tx, &DWP(0,$d,$x,4)); &cmp($in, $ty); - &jle(&label("start")); + &jbe(&label("start")); &set_label("end"); @@ -162,6 +171,32 @@ sub RC4 &RC4_loop(5,0,1); &RC4_loop(6,1,1); + &jmp(&label("finished")); + + &align(16); + # this is essentially Intel P4 specific codepath, see rc4_skey.c... + &set_label("RC4_CHAR"); + + &lea ($ty,&DWP(0,$in,$ty)); + &mov (&swtmp(2),$ty); + + # strangely enough unrolled loop performs over 20% slower... 
+ &set_label("RC4_CHAR_loop"); + &movz ($tx,&BP(0,$d,$x)); + &add (&LB($y),&LB($tx)); + &movz ($ty,&BP(0,$d,$y)); + &movb (&BP(0,$d,$y),&LB($tx)); + &movb (&BP(0,$d,$x),&LB($ty)); + &add (&LB($ty),&LB($tx)); + &movz ($ty,&BP(0,$d,$ty)); + &xorb (&LB($ty),&BP(0,$in)); + &movb (&BP(0,$out),&LB($ty)); + &inc (&LB($x)); + &inc ($in); + &inc ($out); + &cmp ($in,&swtmp(2)); + &jb (&label("RC4_CHAR_loop")); + &set_label("finished"); &dec( $x); &stack_pop(3); diff --git a/crypto/rc4/asm/rc4-amd64.pl b/crypto/rc4/asm/rc4-amd64.pl index cc3f0c023..35e426d56 100755 --- a/crypto/rc4/asm/rc4-amd64.pl +++ b/crypto/rc4/asm/rc4-amd64.pl @@ -25,6 +25,13 @@ # Latter means that if you want to *estimate* what to expect from # *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz. +# Intel P4 EM64T core was found to run the AMD64 code really slowly... +# The only way to achieve comparable performance on P4 is to keep +# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to +# compose blended code that would perform even within a 30% margin +# on both AMD and Intel platforms, I implement both cases. See +# rc4_skey.c for further details... 
+ $output=shift; $win64a=1 if ($output =~ /win64a.[s|asm]/); @@ -90,6 +97,8 @@ $code.=<<___; add \$8,$dat movl `&PTR("DWORD:-8[$dat]")`,$XX#d movl `&PTR("DWORD:-4[$dat]")`,$YY#d + cmpl \$-1,`&PTR("DWORD:256[$dat]")` + je .LRC4_CHAR test \$-8,$len jz .Lloop1 .align 16 @@ -167,6 +176,24 @@ $code.=<<___; dec $len jnz .Lloop1 jmp .Lexit + +.align 16 +.LRC4_CHAR: + inc $XX#b + movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d + add $TX#b,$YY#b + movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d + movb $TX#b,`&PTR("BYTE:[$dat+$YY]")` + movb $TY#b,`&PTR("BYTE:[$dat+$XX]")` + add $TX#b,$TY#b + movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d + xorb `&PTR("BYTE:[$inp]")`,$TY#b + movb $TY#b,`&PTR("BYTE:[$out]")` + inc $inp + inc $out + dec $len + jnz .LRC4_CHAR + jmp .Lexit ___ $code.=<<___ if (defined($win64a)); RC4 ENDP @@ -189,6 +216,8 @@ if (defined($win64a)) { $code =~ s/mov[bwlq]/mov/gm; $code =~ s/movzb/movzx/gm; $code =~ s/repret/DB\t0F3h,0C3h/gm; + $code =~ s/cmpl/cmp/gm; + $code =~ s/xorb/xor/gm; } else { $code =~ s/([QD]*WORD|BYTE)://gm; $code =~ s/repret/.byte\t0xF3,0xC3/gm; diff --git a/crypto/rc4/rc4_locl.h b/crypto/rc4/rc4_locl.h index 3bb80b6ce..c712e1632 100644 --- a/crypto/rc4/rc4_locl.h +++ b/crypto/rc4/rc4_locl.h @@ -1,4 +1,5 @@ #ifndef HEADER_RC4_LOCL_H #define HEADER_RC4_LOCL_H #include +#include #endif diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c index bb10c1ebe..781ff2d8b 100644 --- a/crypto/rc4/rc4_skey.c +++ b/crypto/rc4/rc4_skey.c @@ -93,25 +93,58 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) unsigned int i; d= &(key->data[0]); - for (i=0; i<256; i++) - d[i]=i; key->x = 0; key->y = 0; id1=id2=0; -#define SK_LOOP(n) { \ +#define SK_LOOP(d,n) { \ tmp=d[(n)]; \ id2 = (data[id1] + tmp + id2) & 0xff; \ if (++id1 == len) id1=0; \ d[(n)]=d[id2]; \ d[id2]=tmp; } +#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__INTEL__) || \ + defined(__x86_64) || 
defined(__x86_64__) || defined(_M_AMD64) + if (sizeof(RC4_INT) > 1) { + /* + * Unlike all other x86 [and x86_64] implementations, + * Intel P4 core [including EM64T] was found to perform + * poorly with wider RC4_INT. Performance improvement + * for IA-32 hand-coded assembler turned out to be 2.8x + * if re-coded for RC4_CHAR! It's however inappropriate + * to just switch to RC4_CHAR for x86[_64], as non-P4 + * implementations suffer from significant performance + * losses then, e.g. PIII exhibits >2x deterioration, + * and so does Opteron. In order to assure optimal + * all-round performance, let us [try to] detect P4 at + * run-time by checking upon HTT bit in CPU capability + * vector and set up compressed key schedule, which is + * recognized by correspondingly updated assembler + * module... + * + */ + if (OPENSSL_ia32cap_P & (1<<28)) { + unsigned char *cp=(unsigned char *)d; + + for (i=0;i<256;i++) cp[i]=i; + for (i=0;i<256;i++) SK_LOOP(cp,i); + /* mark schedule as compressed! */ + d[256/sizeof(RC4_INT)]=-1; + return; + } + } +# endif +#endif + for (i=0; i < 256; i++) d[i]=i; for (i=0; i < 256; i+=4) { - SK_LOOP(i+0); - SK_LOOP(i+1); - SK_LOOP(i+2); - SK_LOOP(i+3); + SK_LOOP(d,i+0); + SK_LOOP(d,i+1); + SK_LOOP(d,i+2); + SK_LOOP(d,i+3); } } diff --git a/doc/crypto/OPENSSL_ia32cap.pod b/doc/crypto/OPENSSL_ia32cap.pod index 790e8e9b1..ec6b655c1 100644 --- a/doc/crypto/OPENSSL_ia32cap.pod +++ b/doc/crypto/OPENSSL_ia32cap.pod @@ -14,11 +14,12 @@ OPENSSL_ia32cap Value returned by OPENSSL_ia32cap_loc() is address of a variable containing IA-32 processor capabilities bit vector as it appears in EDX register after executing CPUID instruction with EAX=1 input value (see -Intel Application Note #241618). Naturally it's meaningful on IA-32 +Intel Application Note #241618). Naturally it's meaningful on IA-32[E] platforms only. The variable is normally set up automatically upon toolkit initialization, but can be manipulated afterwards to modify -crypto library behaviour. 
For the moment of this writing only two bits -are significant, namely bit #26 denoting SSE2 support, and bit #4 +crypto library behaviour. At the time of this writing three bits are +significant, namely bit #28 denoting Hyperthreading, which is used to +distinguish the Intel P4 core, bit #26 denoting SSE2 support, and bit #4 denoting presence of Time-Stamp Counter. Clearing bit #26 at run-time for example disables high-performance SSE2 code present in the crypto library. You might have to do this if target OpenSSL application is