RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's
apparently impossible to compose blended code which would perform satisfactorily on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by a factor of 2.8x (on 32-bit core).
This commit is contained in:
parent
00dd8f6d6e
commit
376729e130
@ -318,7 +318,7 @@ my %table=(
|
||||
"linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL::asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"linux-parisc", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::",
|
||||
#### SPARC Linux setups
|
||||
|
4
TABLE
4
TABLE
@ -2086,7 +2086,7 @@ $unistd =
|
||||
$thread_cflag = -D_REENTRANT
|
||||
$sys_id =
|
||||
$lflags = -rdynamic -ldl
|
||||
$bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
|
||||
$bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
|
||||
$cpuid_obj = x86cpuid-elf.o
|
||||
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
|
||||
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
|
||||
@ -3572,7 +3572,7 @@ $thread_cflag = -D_REENTRANT
|
||||
$sys_id =
|
||||
$lflags = -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL
|
||||
$cpuid_obj =
|
||||
$cpuid_obj = amd64cpuid.o
|
||||
$bn_obj = asm/x86_64-gcc.o
|
||||
$des_obj =
|
||||
$aes_obj =
|
||||
|
@ -5,16 +5,63 @@ $win64a=1 if ($output =~ /win64a\.[s|asm]/);
|
||||
open STDOUT,">$output" || die "can't open $output: $!";
|
||||
|
||||
print<<___ if(defined($win64a));
|
||||
TEXT SEGMENT
|
||||
_TEXT SEGMENT
|
||||
PUBLIC OPENSSL_rdtsc
|
||||
ALIGN 16
|
||||
OPENSSL_rdtsc PROC NEAR
|
||||
OPENSSL_rdtsc PROC
|
||||
rdtsc
|
||||
shl rdx,32
|
||||
or rax,rdx
|
||||
ret
|
||||
OPENSSL_rdtsc ENDP
|
||||
TEXT ENDS
|
||||
|
||||
PUBLIC OPENSSL_atomic_add
|
||||
ALIGN 16
|
||||
OPENSSL_atomic_add PROC
|
||||
mov eax,DWORD PTR[rcx]
|
||||
\$Lspin: lea r8,DWORD PTR[rdx+rax]
|
||||
lock cmpxchg DWORD PTR[rcx],r8d
|
||||
jne \$Lspin
|
||||
mov eax,r8d
|
||||
cdqe
|
||||
ret
|
||||
OPENSSL_atomic_add ENDP
|
||||
|
||||
PUBLIC OPENSSL_wipe_cpu
|
||||
ALIGN 16
|
||||
OPENSSL_wipe_cpu PROC
|
||||
pxor xmm0,xmm0
|
||||
pxor xmm1,xmm1
|
||||
pxor xmm2,xmm2
|
||||
pxor xmm3,xmm3
|
||||
pxor xmm4,xmm4
|
||||
pxor xmm5,xmm5
|
||||
xor rcx,rcx
|
||||
xor rdx,rdx
|
||||
xor r8,r8
|
||||
xor r9,r9
|
||||
xor r10,r10
|
||||
xor r11,r11
|
||||
lea rax,QWORD PTR[rsp+8]
|
||||
ret
|
||||
OPENSSL_wipe_cpu ENDP
|
||||
|
||||
OPENSSL_ia32_cpuid PROC
|
||||
mov r8,rbx
|
||||
mov eax,1
|
||||
cpuid
|
||||
shl rcx,32
|
||||
mov eax,edx
|
||||
mov rbx,r8
|
||||
or rax,rcx
|
||||
ret
|
||||
OPENSSL_ia32_cpuid ENDP
|
||||
_TEXT ENDS
|
||||
|
||||
CRT\$XIU SEGMENT
|
||||
EXTRN OPENSSL_cpuid_setup:PROC
|
||||
DQ OPENSSL_cpuid_setup
|
||||
CRT\$XIU ENDS
|
||||
END
|
||||
___
|
||||
print<<___ if(!defined($win64a));
|
||||
@ -27,4 +74,66 @@ OPENSSL_rdtsc:
|
||||
or %rdx,%rax
|
||||
ret
|
||||
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
|
||||
|
||||
.globl OPENSSL_atomic_add
|
||||
.type OPENSSL_atomic_add,\@function
|
||||
.align 16
|
||||
OPENSSL_atomic_add:
|
||||
movl (%rdi),%eax
|
||||
.Lspin: lea (%rsi,%rax),%r8
|
||||
lock; cmpxchg %r8d,(%rdi)
|
||||
jne .Lspin
|
||||
mov %r8d,%eax
|
||||
cdqe
|
||||
ret
|
||||
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
|
||||
|
||||
.globl OPENSSL_wipe_cpu
|
||||
.type OPENSSL_wipe_cpu,\@function
|
||||
.align 16
|
||||
OPENSSL_wipe_cpu:
|
||||
pxor %xmm0,%xmm0
|
||||
pxor %xmm1,%xmm1
|
||||
pxor %xmm2,%xmm2
|
||||
pxor %xmm3,%xmm3
|
||||
pxor %xmm4,%xmm4
|
||||
pxor %xmm5,%xmm5
|
||||
pxor %xmm6,%xmm6
|
||||
pxor %xmm7,%xmm7
|
||||
pxor %xmm8,%xmm8
|
||||
pxor %xmm9,%xmm9
|
||||
pxor %xmm10,%xmm10
|
||||
pxor %xmm11,%xmm11
|
||||
pxor %xmm12,%xmm12
|
||||
pxor %xmm13,%xmm13
|
||||
pxor %xmm14,%xmm14
|
||||
pxor %xmm15,%xmm15
|
||||
xor %rcx,%rcx
|
||||
xor %rdx,%rdx
|
||||
xor %rsi,%rsi
|
||||
xor %rdi,%rdi
|
||||
xor %r8,%r8
|
||||
xor %r9,%r9
|
||||
xor %r10,%r10
|
||||
xor %r11,%r11
|
||||
lea 8(%rsp),%rax
|
||||
ret
|
||||
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
|
||||
|
||||
.globl OPENSSL_ia32_cpuid
|
||||
.align 16
|
||||
OPENSSL_ia32_cpuid:
|
||||
mov %rbx,%r8
|
||||
mov \$1,%eax
|
||||
cpuid
|
||||
shl \$32,%rcx
|
||||
mov %edx,%eax
|
||||
mov %r8,%rbx
|
||||
or %rcx,%rax
|
||||
ret
|
||||
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
||||
|
||||
.section .init
|
||||
call OPENSSL_cpuid_setup
|
||||
.align 16
|
||||
___
|
||||
|
@ -161,7 +161,7 @@ sub main'shl { &out2("sall",@_); }
|
||||
sub main'shr { &out2("shrl",@_); }
|
||||
sub main'xor { &out2("xorl",@_); }
|
||||
sub main'xorb { &out2("xorb",@_); }
|
||||
sub main'add { &out2("addl",@_); }
|
||||
sub main'add { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); }
|
||||
sub main'adc { &out2("adcl",@_); }
|
||||
sub main'sub { &out2("subl",@_); }
|
||||
sub main'sbb { &out2("sbbl",@_); }
|
||||
@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); }
|
||||
sub main'jnc { &out1("jnc",@_); }
|
||||
sub main'jno { &out1("jno",@_); }
|
||||
sub main'dec { &out1("decl",@_); }
|
||||
sub main'inc { &out1("incl",@_); }
|
||||
sub main'inc { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); }
|
||||
sub main'push { &out1("pushl",@_); $stack+=4; }
|
||||
sub main'pop { &out1("popl",@_); $stack-=4; }
|
||||
sub main'pushf { &out0("pushfl"); $stack+=4; }
|
||||
@ -205,9 +205,10 @@ sub main'nop { &out0("nop"); }
|
||||
sub main'test { &out2("testl",@_); }
|
||||
sub main'bt { &out2("btl",@_); }
|
||||
sub main'leave { &out0("leave"); }
|
||||
sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); }
|
||||
sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); }
|
||||
sub main'cpuid { &out0(".byte\t0x0f,0xa2"); }
|
||||
sub main'rdtsc { &out0(".byte\t0x0f,0x31"); }
|
||||
sub main'halt { &out0("hlt"); }
|
||||
sub main'movz { &out2("movzb",@_); }
|
||||
|
||||
# SSE2
|
||||
sub main'emms { &out0("emms"); }
|
||||
@ -558,7 +559,7 @@ sub main'file_end
|
||||
pushl %ebx
|
||||
movl %edx,%edi
|
||||
movl \$1,%eax
|
||||
.byte 0x0f; .byte 0xa2
|
||||
.byte 0x0f,0xa2
|
||||
orl \$1<<10,%edx
|
||||
movl %edx,0(%edi)
|
||||
popl %ebx
|
||||
|
@ -7,10 +7,10 @@ require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],"rc4-586.pl");
|
||||
|
||||
$tx="eax";
|
||||
$ty="ebx";
|
||||
$x="ecx";
|
||||
$y="edx";
|
||||
$x="eax";
|
||||
$y="ebx";
|
||||
$tx="ecx";
|
||||
$ty="edx";
|
||||
$in="esi";
|
||||
$out="edi";
|
||||
$d="ebp";
|
||||
@ -31,7 +31,7 @@ sub RC4_loop
|
||||
{
|
||||
&mov($ty, &swtmp(2));
|
||||
&cmp($ty, $in);
|
||||
&jle(&label("finished"));
|
||||
&jbe(&label("finished"));
|
||||
&inc($in);
|
||||
}
|
||||
else
|
||||
@ -39,7 +39,7 @@ sub RC4_loop
|
||||
&add($ty, 8);
|
||||
&inc($in);
|
||||
&cmp($ty, $in);
|
||||
&jl(&label("finished"));
|
||||
&jb(&label("finished"));
|
||||
&mov(&swtmp(2), $ty);
|
||||
}
|
||||
}
|
||||
@ -88,35 +88,44 @@ sub RC4
|
||||
|
||||
&function_begin_B($name,"");
|
||||
|
||||
&mov($ty,&wparam(1)); # len
|
||||
&cmp($ty,0);
|
||||
&jne(&label("proceed"));
|
||||
&ret();
|
||||
&set_label("proceed");
|
||||
|
||||
&comment("");
|
||||
|
||||
&push("ebp");
|
||||
&push("ebx");
|
||||
&mov( $d, &wparam(0)); # key
|
||||
&mov( $ty, &wparam(1)); # num
|
||||
&push("esi");
|
||||
&push("edi");
|
||||
&mov( $d, &wparam(0)); # key
|
||||
&mov( $in, &wparam(2));
|
||||
|
||||
&mov( $x, &DWP(0,$d,"",1));
|
||||
&mov( $y, &DWP(4,$d,"",1));
|
||||
|
||||
&mov( $in, &wparam(2));
|
||||
&mov( $out, &wparam(3));
|
||||
&inc( $x);
|
||||
|
||||
&stack_push(3); # 3 temp variables
|
||||
&add( $d, 8);
|
||||
&and( $x, 0xff);
|
||||
|
||||
# detect compressed schedule, see commentary section in rc4_skey.c...
|
||||
&cmp(&DWP(256,$d),-1);
|
||||
&je(&label("RC4_CHAR"));
|
||||
|
||||
&lea( $ty, &DWP(-8,$ty,$in));
|
||||
|
||||
# check for 0 length input
|
||||
|
||||
&mov( $out, &wparam(3));
|
||||
&mov( &swtmp(2), $ty); # this is now address to exit at
|
||||
&mov( $tx, &DWP(0,$d,$x,4));
|
||||
|
||||
&cmp( $ty, $in);
|
||||
&jl( &label("end")); # less than 8 bytes
|
||||
&jb( &label("end")); # less than 8 bytes
|
||||
|
||||
&set_label("start");
|
||||
|
||||
@ -148,7 +157,7 @@ sub RC4
|
||||
&mov( &DWP(-4,$out,"",0), $tx);
|
||||
&mov( $tx, &DWP(0,$d,$x,4));
|
||||
&cmp($in, $ty);
|
||||
&jle(&label("start"));
|
||||
&jbe(&label("start"));
|
||||
|
||||
&set_label("end");
|
||||
|
||||
@ -162,6 +171,32 @@ sub RC4
|
||||
&RC4_loop(5,0,1);
|
||||
&RC4_loop(6,1,1);
|
||||
|
||||
&jmp(&label("finished"));
|
||||
|
||||
&align(16);
|
||||
# this is essentially Intel P4 specific codepath, see rc4_skey.c...
|
||||
&set_label("RC4_CHAR");
|
||||
|
||||
&lea ($ty,&DWP(0,$in,$ty));
|
||||
&mov (&swtmp(2),$ty);
|
||||
|
||||
# strangely enough unrolled loop performs over 20% slower...
|
||||
&set_label("RC4_CHAR_loop");
|
||||
&movz ($tx,&BP(0,$d,$x));
|
||||
&add (&LB($y),&LB($tx));
|
||||
&movz ($ty,&BP(0,$d,$y));
|
||||
&movb (&BP(0,$d,$y),&LB($tx));
|
||||
&movb (&BP(0,$d,$x),&LB($ty));
|
||||
&add (&LB($ty),&LB($tx));
|
||||
&movz ($ty,&BP(0,$d,$ty));
|
||||
&xorb (&LB($ty),&BP(0,$in));
|
||||
&movb (&BP(0,$out),&LB($ty));
|
||||
&inc (&LB($x));
|
||||
&inc ($in);
|
||||
&inc ($out);
|
||||
&cmp ($in,&swtmp(2));
|
||||
&jb (&label("RC4_CHAR_loop"));
|
||||
|
||||
&set_label("finished");
|
||||
&dec( $x);
|
||||
&stack_pop(3);
|
||||
|
@ -25,6 +25,13 @@
|
||||
# Latter means that if you want to *estimate* what to expect from
|
||||
# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
|
||||
|
||||
# Intel P4 EM64T core was found to run the AMD64 code really slow...
|
||||
# The only way to achieve comparable performance on P4 is to keep
|
||||
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
|
||||
# compose blended code, which would perform even within a 30% margin
|
||||
# on both AMD and Intel platforms, I implement both cases. See
|
||||
# rc4_skey.c for further details...
|
||||
|
||||
$output=shift;
|
||||
|
||||
$win64a=1 if ($output =~ /win64a.[s|asm]/);
|
||||
@ -90,6 +97,8 @@ $code.=<<___;
|
||||
add \$8,$dat
|
||||
movl `&PTR("DWORD:-8[$dat]")`,$XX#d
|
||||
movl `&PTR("DWORD:-4[$dat]")`,$YY#d
|
||||
cmpl \$-1,`&PTR("DWORD:256[$dat]")`
|
||||
je .LRC4_CHAR
|
||||
test \$-8,$len
|
||||
jz .Lloop1
|
||||
.align 16
|
||||
@ -167,6 +176,24 @@ $code.=<<___;
|
||||
dec $len
|
||||
jnz .Lloop1
|
||||
jmp .Lexit
|
||||
|
||||
.align 16
|
||||
.LRC4_CHAR:
|
||||
inc $XX#b
|
||||
movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d
|
||||
add $TX#b,$YY#b
|
||||
movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d
|
||||
movb $TX#b,`&PTR("BYTE:[$dat+$YY]")`
|
||||
movb $TY#b,`&PTR("BYTE:[$dat+$XX]")`
|
||||
add $TX#b,$TY#b
|
||||
movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d
|
||||
xorb `&PTR("BYTE:[$inp]")`,$TY#b
|
||||
movb $TY#b,`&PTR("BYTE:[$out]")`
|
||||
inc $inp
|
||||
inc $out
|
||||
dec $len
|
||||
jnz .LRC4_CHAR
|
||||
jmp .Lexit
|
||||
___
|
||||
$code.=<<___ if (defined($win64a));
|
||||
RC4 ENDP
|
||||
@ -189,6 +216,8 @@ if (defined($win64a)) {
|
||||
$code =~ s/mov[bwlq]/mov/gm;
|
||||
$code =~ s/movzb/movzx/gm;
|
||||
$code =~ s/repret/DB\t0F3h,0C3h/gm;
|
||||
$code =~ s/cmpl/cmp/gm;
|
||||
$code =~ s/xorb/xor/gm;
|
||||
} else {
|
||||
$code =~ s/([QD]*WORD|BYTE)://gm;
|
||||
$code =~ s/repret/.byte\t0xF3,0xC3/gm;
|
||||
|
@ -1,4 +1,5 @@
|
||||
#ifndef HEADER_RC4_LOCL_H
|
||||
#define HEADER_RC4_LOCL_H
|
||||
#include <openssl/opensslconf.h>
|
||||
#include <cryptlib.h>
|
||||
#endif
|
||||
|
@ -93,25 +93,58 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
|
||||
unsigned int i;
|
||||
|
||||
d= &(key->data[0]);
|
||||
for (i=0; i<256; i++)
|
||||
d[i]=i;
|
||||
key->x = 0;
|
||||
key->y = 0;
|
||||
id1=id2=0;
|
||||
|
||||
#define SK_LOOP(n) { \
|
||||
#define SK_LOOP(d,n) { \
|
||||
tmp=d[(n)]; \
|
||||
id2 = (data[id1] + tmp + id2) & 0xff; \
|
||||
if (++id1 == len) id1=0; \
|
||||
d[(n)]=d[id2]; \
|
||||
d[id2]=tmp; }
|
||||
|
||||
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
|
||||
# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
|
||||
defined(__INTEL__) || \
|
||||
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
|
||||
if (sizeof(RC4_INT) > 1) {
|
||||
/*
|
||||
* Unlike all other x86 [and x86_64] implementations,
|
||||
* Intel P4 core [including EM64T] was found to perform
|
||||
* poorly with wider RC4_INT. Performance improvement
|
||||
* for IA-32 hand-coded assembler turned out to be 2.8x
|
||||
* if re-coded for RC4_CHAR! It's however inappropriate
|
||||
* to just switch to RC4_CHAR for x86[_64], as non-P4
|
||||
* implementations suffer from significant performance
|
||||
* losses then, e.g. PIII exhibits >2x deterioration,
|
||||
* and so does Opteron. In order to assure optimal
|
||||
* all-round performance, let us [try to] detect P4 at
|
||||
* run-time by checking upon HTT bit in CPU capability
|
||||
* vector and set up compressed key schedule, which is
|
||||
* recognized by correspondingly updated assembler
|
||||
* module...
|
||||
* <appro@fy.chalmers.se>
|
||||
*/
|
||||
if (OPENSSL_ia32cap_P & (1<<28)) {
|
||||
unsigned char *cp=(unsigned char *)d;
|
||||
|
||||
for (i=0;i<256;i++) cp[i]=i;
|
||||
for (i=0;i<256;i++) SK_LOOP(cp,i);
|
||||
/* mark schedule as compressed! */
|
||||
d[256/sizeof(RC4_INT)]=-1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
# endif
|
||||
#endif
|
||||
for (i=0; i < 256; i++) d[i]=i;
|
||||
for (i=0; i < 256; i+=4)
|
||||
{
|
||||
SK_LOOP(i+0);
|
||||
SK_LOOP(i+1);
|
||||
SK_LOOP(i+2);
|
||||
SK_LOOP(i+3);
|
||||
SK_LOOP(d,i+0);
|
||||
SK_LOOP(d,i+1);
|
||||
SK_LOOP(d,i+2);
|
||||
SK_LOOP(d,i+3);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -14,11 +14,12 @@ OPENSSL_ia32cap
|
||||
Value returned by OPENSSL_ia32cap_loc() is address of a variable
|
||||
containing IA-32 processor capabilities bit vector as it appears in EDX
|
||||
register after executing CPUID instruction with EAX=1 input value (see
|
||||
Intel Application Note #241618). Naturally it's meaningful on IA-32
|
||||
Intel Application Note #241618). Naturally it's meaningful on IA-32[E]
|
||||
platforms only. The variable is normally set up automatically upon
|
||||
toolkit initialization, but can be manipulated afterwards to modify
|
||||
crypto library behaviour. For the moment of this writing only two bits
|
||||
are significant, namely bit #26 denoting SSE2 support, and bit #4
|
||||
crypto library behaviour. For the moment of this writing three bits are
|
||||
significant, namely bit #28 denoting Hyperthreading, which is used to
|
||||
distinguish Intel P4 core, bit #26 denoting SSE2 support, and bit #4
|
||||
denoting presence of Time-Stamp Counter. Clearing bit #26 at run-time
|
||||
for example disables high-performance SSE2 code present in the crypto
|
||||
library. You might have to do this if target OpenSSL application is
|
||||
|
Loading…
x
Reference in New Issue
Block a user