x86[_64]cpuid.pl: handle new extensions.
This commit is contained in:
parent
a3e07010b4
commit
b906422149
@ -47,7 +47,7 @@ OPENSSL_rdtsc:
|
|||||||
.type OPENSSL_ia32_cpuid,\@abi-omnipotent
|
.type OPENSSL_ia32_cpuid,\@abi-omnipotent
|
||||||
.align 16
|
.align 16
|
||||||
OPENSSL_ia32_cpuid:
|
OPENSSL_ia32_cpuid:
|
||||||
mov %rbx,%r8
|
mov %rbx,%r8 # save %rbx
|
||||||
|
|
||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
cpuid
|
cpuid
|
||||||
@ -79,7 +79,15 @@ OPENSSL_ia32_cpuid:
|
|||||||
# AMD specific
|
# AMD specific
|
||||||
mov \$0x80000000,%eax
|
mov \$0x80000000,%eax
|
||||||
cpuid
|
cpuid
|
||||||
cmp \$0x80000008,%eax
|
cmp \$0x80000001,%eax
|
||||||
|
jb .Lintel
|
||||||
|
mov %eax,%r10d
|
||||||
|
mov \$0x80000001,%eax
|
||||||
|
cpuid
|
||||||
|
or %ecx,%r9d
|
||||||
|
and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
|
||||||
|
|
||||||
|
cmp \$0x80000008,%r10d
|
||||||
jb .Lintel
|
jb .Lintel
|
||||||
|
|
||||||
mov \$0x80000008,%eax
|
mov \$0x80000008,%eax
|
||||||
@ -90,12 +98,12 @@ OPENSSL_ia32_cpuid:
|
|||||||
mov \$1,%eax
|
mov \$1,%eax
|
||||||
cpuid
|
cpuid
|
||||||
bt \$28,%edx # test hyper-threading bit
|
bt \$28,%edx # test hyper-threading bit
|
||||||
jnc .Ldone
|
jnc .Lgeneric
|
||||||
shr \$16,%ebx # number of logical processors
|
shr \$16,%ebx # number of logical processors
|
||||||
cmp %r10b,%bl
|
cmp %r10b,%bl
|
||||||
ja .Ldone
|
ja .Lgeneric
|
||||||
and \$0xefffffff,%edx # ~(1<<28)
|
and \$0xefffffff,%edx # ~(1<<28)
|
||||||
jmp .Ldone
|
jmp .Lgeneric
|
||||||
|
|
||||||
.Lintel:
|
.Lintel:
|
||||||
cmp \$4,%r11d
|
cmp \$4,%r11d
|
||||||
@ -121,21 +129,38 @@ OPENSSL_ia32_cpuid:
|
|||||||
or \$0x40000000,%edx # use reserved bit to skip unrolled loop
|
or \$0x40000000,%edx # use reserved bit to skip unrolled loop
|
||||||
.Lnotintel:
|
.Lnotintel:
|
||||||
bt \$28,%edx # test hyper-threading bit
|
bt \$28,%edx # test hyper-threading bit
|
||||||
jnc .Ldone
|
jnc .Lgeneric
|
||||||
and \$0xefffffff,%edx # ~(1<<28)
|
and \$0xefffffff,%edx # ~(1<<28)
|
||||||
cmp \$0,%r10d
|
cmp \$0,%r10d
|
||||||
je .Ldone
|
je .Lgeneric
|
||||||
|
|
||||||
or \$0x10000000,%edx # 1<<28
|
or \$0x10000000,%edx # 1<<28
|
||||||
shr \$16,%ebx
|
shr \$16,%ebx
|
||||||
cmp \$1,%bl # see if cache is shared
|
cmp \$1,%bl # see if cache is shared
|
||||||
ja .Ldone
|
ja .Lgeneric
|
||||||
and \$0xefffffff,%edx # ~(1<<28)
|
and \$0xefffffff,%edx # ~(1<<28)
|
||||||
.Ldone:
|
.Lgeneric:
|
||||||
|
and \$0x00000800,%r9d # isolate AMD XOP flag
|
||||||
|
and \$0xfffff7ff,%ecx
|
||||||
|
or %r9d,%ecx # merge AMD XOP flag
|
||||||
|
|
||||||
shl \$32,%rcx
|
shl \$32,%rcx
|
||||||
mov %edx,%eax
|
mov %edx,%ebx
|
||||||
mov %r8,%rbx
|
or %rcx,%rbx # compose capability vector in %rbx
|
||||||
or %rcx,%rax
|
bt \$27+32,%rcx # check OSXSAVE bit
|
||||||
|
jnc .Lclear_avx
|
||||||
|
xor %ecx,%ecx # XCR0
|
||||||
|
.byte 0x0f,0x01,0xd0 # xgetbv
|
||||||
|
and \$6,%eax # isolate XMM and YMM state support
|
||||||
|
cmp \$6,%eax
|
||||||
|
je .Ldone
|
||||||
|
.Lclear_avx:
|
||||||
|
mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
|
||||||
|
shl \$32,%rax
|
||||||
|
and %rax,%rbx # clear AVX, FMA and AMD XOP bits
|
||||||
|
.Ldone:
|
||||||
|
mov %rbx,%rax
|
||||||
|
mov %r8,%rbx # restore %rbx
|
||||||
ret
|
ret
|
||||||
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
||||||
|
|
||||||
@ -250,7 +275,7 @@ OPENSSL_instrument_bus:
|
|||||||
mov %eax,$lasttick # lasttick = tick
|
mov %eax,$lasttick # lasttick = tick
|
||||||
mov \$0,$lastdiff # lastdiff = 0
|
mov \$0,$lastdiff # lastdiff = 0
|
||||||
clflush ($out)
|
clflush ($out)
|
||||||
lock
|
.byte 0xf0 # lock
|
||||||
add $lastdiff,($out)
|
add $lastdiff,($out)
|
||||||
jmp .Loop
|
jmp .Loop
|
||||||
.align 16
|
.align 16
|
||||||
@ -260,7 +285,7 @@ OPENSSL_instrument_bus:
|
|||||||
mov %edx,$lasttick
|
mov %edx,$lasttick
|
||||||
mov %eax,$lastdiff
|
mov %eax,$lastdiff
|
||||||
clflush ($out)
|
clflush ($out)
|
||||||
lock
|
.byte 0xf0 # lock
|
||||||
add %eax,($out)
|
add %eax,($out)
|
||||||
lea 4($out),$out
|
lea 4($out),$out
|
||||||
sub \$1,$cnt
|
sub \$1,$cnt
|
||||||
@ -284,7 +309,7 @@ OPENSSL_instrument_bus2:
|
|||||||
mov \$0,$lastdiff # lastdiff = 0
|
mov \$0,$lastdiff # lastdiff = 0
|
||||||
|
|
||||||
clflush ($out)
|
clflush ($out)
|
||||||
lock
|
.byte 0xf0 # lock
|
||||||
add $lastdiff,($out)
|
add $lastdiff,($out)
|
||||||
|
|
||||||
rdtsc # collect 1st diff
|
rdtsc # collect 1st diff
|
||||||
@ -294,7 +319,7 @@ OPENSSL_instrument_bus2:
|
|||||||
mov %eax,$lastdiff # lastdiff = diff
|
mov %eax,$lastdiff # lastdiff = diff
|
||||||
.Loop2:
|
.Loop2:
|
||||||
clflush ($out)
|
clflush ($out)
|
||||||
lock
|
.byte 0xf0 # lock
|
||||||
add %eax,($out) # accumulate diff
|
add %eax,($out) # accumulate diff
|
||||||
|
|
||||||
sub \$1,$max
|
sub \$1,$max
|
||||||
|
@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||||||
&pop ("eax");
|
&pop ("eax");
|
||||||
&xor ("ecx","eax");
|
&xor ("ecx","eax");
|
||||||
&bt ("ecx",21);
|
&bt ("ecx",21);
|
||||||
&jnc (&label("done"));
|
&jnc (&label("generic"));
|
||||||
&xor ("eax","eax");
|
&xor ("eax","eax");
|
||||||
&cpuid ();
|
&cpuid ();
|
||||||
&mov ("edi","eax"); # max value for standard query level
|
&mov ("edi","eax"); # max value for standard query level
|
||||||
@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||||||
# AMD specific
|
# AMD specific
|
||||||
&mov ("eax",0x80000000);
|
&mov ("eax",0x80000000);
|
||||||
&cpuid ();
|
&cpuid ();
|
||||||
&cmp ("eax",0x80000008);
|
&cmp ("eax",0x80000001);
|
||||||
|
&jb (&label("intel"));
|
||||||
|
&mov ("esi","eax");
|
||||||
|
&mov ("eax",0x80000001);
|
||||||
|
&cpuid ();
|
||||||
|
&or ("ebp","ecx");
|
||||||
|
&and ("ebp",1<<11|1); # isolate XOP bit
|
||||||
|
&cmp ("esi",0x80000008);
|
||||||
&jb (&label("intel"));
|
&jb (&label("intel"));
|
||||||
|
|
||||||
&mov ("eax",0x80000008);
|
&mov ("eax",0x80000008);
|
||||||
@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||||||
&mov ("eax",1);
|
&mov ("eax",1);
|
||||||
&cpuid ();
|
&cpuid ();
|
||||||
&bt ("edx",28);
|
&bt ("edx",28);
|
||||||
&jnc (&label("done"));
|
&jnc (&label("generic"));
|
||||||
&shr ("ebx",16);
|
&shr ("ebx",16);
|
||||||
&and ("ebx",0xff);
|
&and ("ebx",0xff);
|
||||||
&cmp ("ebx","esi");
|
&cmp ("ebx","esi");
|
||||||
&ja (&label("done"));
|
&ja (&label("generic"));
|
||||||
&and ("edx",0xefffffff); # clear hyper-threading bit
|
&and ("edx",0xefffffff); # clear hyper-threading bit
|
||||||
&jmp (&label("done"));
|
&jmp (&label("generic"));
|
||||||
|
|
||||||
&set_label("intel");
|
&set_label("intel");
|
||||||
&cmp ("edi",4);
|
&cmp ("edi",4);
|
||||||
@ -93,19 +100,42 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||||||
&or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
|
&or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
|
||||||
&set_label("notP4");
|
&set_label("notP4");
|
||||||
&bt ("edx",28); # test hyper-threading bit
|
&bt ("edx",28); # test hyper-threading bit
|
||||||
&jnc (&label("done"));
|
&jnc (&label("generic"));
|
||||||
&and ("edx",0xefffffff);
|
&and ("edx",0xefffffff);
|
||||||
&cmp ("edi",0);
|
&cmp ("edi",0);
|
||||||
&je (&label("done"));
|
&je (&label("generic"));
|
||||||
|
|
||||||
&or ("edx",0x10000000);
|
&or ("edx",0x10000000);
|
||||||
&shr ("ebx",16);
|
&shr ("ebx",16);
|
||||||
&cmp (&LB("ebx"),1);
|
&cmp (&LB("ebx"),1);
|
||||||
&ja (&label("done"));
|
&ja (&label("generic"));
|
||||||
&and ("edx",0xefffffff); # clear hyper-threading bit if not
|
&and ("edx",0xefffffff); # clear hyper-threading bit if not
|
||||||
|
|
||||||
|
&set_label("generic");
|
||||||
|
&and ("ebp",1<<11); # isolate AMD XOP flag
|
||||||
|
&and ("ecx",~(1<<11));
|
||||||
|
&mov ("esi","edx");
|
||||||
|
&or ("ebp","ecx"); # merge AMD XOP flag
|
||||||
|
|
||||||
|
&bt ("ecx",26); # check XSAVE bit
|
||||||
|
&jnc (&label("done"));
|
||||||
|
&bt ("ecx",27); # check OSXSAVE bit
|
||||||
|
&jnc (&label("clear_xmm"));
|
||||||
|
&xor ("ecx","ecx");
|
||||||
|
&data_byte(0x0f,0x01,0xd0); # xgetbv
|
||||||
|
&and ("eax",6);
|
||||||
|
&cmp ("eax",6);
|
||||||
|
&je (&label("done"));
|
||||||
|
&cmp ("eax",2);
|
||||||
|
&je (&label("clear_avx"));
|
||||||
|
&set_label("clear_xmm");
|
||||||
|
&and ("ebp",~(1<<25|1<<1)); # clear AESNI and PCLMULQDQ bits
|
||||||
|
&and ("esi",~(1<<24)); # clear FXSR
|
||||||
|
&set_label("clear_avx");
|
||||||
|
&and ("ebp",~(1<<28|1<<12|1<<11));# clear AVX, FMA and AMD XOP bits
|
||||||
&set_label("done");
|
&set_label("done");
|
||||||
&mov ("eax","edx");
|
&mov ("eax","esi");
|
||||||
&mov ("edx","ecx");
|
&mov ("edx","ebp");
|
||||||
&function_end("OPENSSL_ia32_cpuid");
|
&function_end("OPENSSL_ia32_cpuid");
|
||||||
|
|
||||||
&external_label("OPENSSL_ia32cap_P");
|
&external_label("OPENSSL_ia32cap_P");
|
||||||
@ -199,8 +229,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||||||
&bt (&DWP(0,"ecx"),1);
|
&bt (&DWP(0,"ecx"),1);
|
||||||
&jnc (&label("no_x87"));
|
&jnc (&label("no_x87"));
|
||||||
if ($sse2) {
|
if ($sse2) {
|
||||||
&bt (&DWP(0,"ecx"),26);
|
&and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
|
||||||
&jnc (&label("no_sse2"));
|
&cmp ("ecx",1<<26|1<<24);
|
||||||
|
&jne (&label("no_sse2"));
|
||||||
&pxor ("xmm0","xmm0");
|
&pxor ("xmm0","xmm0");
|
||||||
&pxor ("xmm1","xmm1");
|
&pxor ("xmm1","xmm1");
|
||||||
&pxor ("xmm2","xmm2");
|
&pxor ("xmm2","xmm2");
|
||||||
@ -331,7 +362,7 @@ my $max = "ebp";
|
|||||||
&mov ($lasttick,"eax"); # lasttick = tick
|
&mov ($lasttick,"eax"); # lasttick = tick
|
||||||
&mov ($lastdiff,0); # lastdiff = 0
|
&mov ($lastdiff,0); # lastdiff = 0
|
||||||
&clflush(&DWP(0,$out));
|
&clflush(&DWP(0,$out));
|
||||||
&lock ();
|
&data_byte(0xf0); # lock
|
||||||
&add (&DWP(0,$out),$lastdiff);
|
&add (&DWP(0,$out),$lastdiff);
|
||||||
&jmp (&label("loop"));
|
&jmp (&label("loop"));
|
||||||
|
|
||||||
@ -342,7 +373,7 @@ my $max = "ebp";
|
|||||||
&mov ($lasttick,"edx"); # lasttick = tick
|
&mov ($lasttick,"edx"); # lasttick = tick
|
||||||
&mov ($lastdiff,"eax"); # lastdiff = diff
|
&mov ($lastdiff,"eax"); # lastdiff = diff
|
||||||
&clflush(&DWP(0,$out));
|
&clflush(&DWP(0,$out));
|
||||||
&lock ();
|
&data_byte(0xf0); # lock
|
||||||
&add (&DWP(0,$out),"eax"); # accumulate diff
|
&add (&DWP(0,$out),"eax"); # accumulate diff
|
||||||
&lea ($out,&DWP(4,$out)); # ++$out
|
&lea ($out,&DWP(4,$out)); # ++$out
|
||||||
&sub ($cnt,1); # --$cnt
|
&sub ($cnt,1); # --$cnt
|
||||||
@ -371,7 +402,7 @@ my $max = "ebp";
|
|||||||
&mov ($lastdiff,0); # lastdiff = 0
|
&mov ($lastdiff,0); # lastdiff = 0
|
||||||
|
|
||||||
&clflush(&DWP(0,$out));
|
&clflush(&DWP(0,$out));
|
||||||
&lock ();
|
&data_byte(0xf0); # lock
|
||||||
&add (&DWP(0,$out),$lastdiff);
|
&add (&DWP(0,$out),$lastdiff);
|
||||||
|
|
||||||
&rdtsc (); # collect 1st diff
|
&rdtsc (); # collect 1st diff
|
||||||
@ -383,7 +414,7 @@ my $max = "ebp";
|
|||||||
|
|
||||||
&set_label("loop2",16);
|
&set_label("loop2",16);
|
||||||
&clflush(&DWP(0,$out));
|
&clflush(&DWP(0,$out));
|
||||||
&lock ();
|
&data_byte(0xf0); # lock
|
||||||
&add (&DWP(0,$out),"eax"); # accumulate diff
|
&add (&DWP(0,$out),"eax"); # accumulate diff
|
||||||
|
|
||||||
&sub ($max,1);
|
&sub ($max,1);
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
=head1 NAME
|
=head1 NAME
|
||||||
|
|
||||||
OPENSSL_ia32cap - finding the IA-32 processor capabilities
|
OPENSSL_ia32cap - the IA-32 processor capabilities vector
|
||||||
|
|
||||||
=head1 SYNOPSIS
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
@ -18,30 +18,52 @@ input value (see Intel Application Note #241618). Naturally it's
|
|||||||
meaningful on x86 and x86_64 platforms only. The variable is normally
|
meaningful on x86 and x86_64 platforms only. The variable is normally
|
||||||
set up automatically upon toolkit initialization, but can be
|
set up automatically upon toolkit initialization, but can be
|
||||||
manipulated afterwards to modify crypto library behaviour. For the
|
manipulated afterwards to modify crypto library behaviour. For the
|
||||||
moment of this writing seven bits are significant, namely:
|
moment of this writing the following bits are significant:
|
||||||
|
|
||||||
1. bit #4 denoting presence of Time-Stamp Counter.
|
=item bit #4 denoting presence of Time-Stamp Counter;
|
||||||
2. bit #20, reserved by Intel, is used to choose among RC4 code
|
|
||||||
paths;
|
=item bit #19 denoting availability of CLFLUSH instruction;
|
||||||
3. bit #23 denoting MMX support;
|
|
||||||
4. bit #25 denoting SSE support;
|
=item bit #20, reserved by Intel, is used to choose among RC4 code paths;
|
||||||
5. bit #26 denoting SSE2 support;
|
|
||||||
6. bit #28 denoting Hyperthreading, which is used to distiguish
|
=item bit #23 denoting MMX support;
|
||||||
cores with shared cache;
|
|
||||||
7. bit #30, reserved by Intel, is used to choose among RC4 code
|
=item bit #24, FXSR bit, denoting availability of XMM registers;
|
||||||
paths;
|
|
||||||
8. bit #57 denoting Intel AES instruction set extension;
|
=item bit #25 denoting SSE support;
|
||||||
|
|
||||||
|
=item bit #26 denoting SSE2 support;
|
||||||
|
|
||||||
|
=item bit #28 denoting Hyperthreading, which is used to distinguish
|
||||||
|
cores with shared cache;
|
||||||
|
|
||||||
|
=item bit #30, reserved by Intel, is used to choose among RC4 code
|
||||||
|
paths;
|
||||||
|
|
||||||
|
=item bit #33 denoting availability of PCLMULQDQ instruction;
|
||||||
|
|
||||||
|
=item bit #41 denoting SSSE3, Supplemental SSE3, support;
|
||||||
|
|
||||||
|
=item bit #43 denoting AMD XOP support (forced to zero on Intel);
|
||||||
|
|
||||||
|
=item bit #57 denoting AES-NI instruction set extension;
|
||||||
|
|
||||||
|
=item bit #59, OSXSAVE bit, denoting availability of YMM registers;
|
||||||
|
|
||||||
|
=item bit #60 denoting AVX extension;
|
||||||
|
|
||||||
For example, clearing bit #26 at run-time disables high-performance
|
For example, clearing bit #26 at run-time disables high-performance
|
||||||
SSE2 code present in the crypto library. You might have to do this if
|
SSE2 code present in the crypto library, while clearing bit #24
|
||||||
target OpenSSL application is executed on SSE2 capable CPU, but under
|
disables SSE2 code operating on 128-bit XMM register bank. You might
|
||||||
control of OS which does not support SSE2 extentions. Even though you
|
have to do the latter if target OpenSSL application is executed on SSE2
|
||||||
can manipulate the value programmatically, you most likely will find it
|
capable CPU, but under control of OS that does not enable XMM
|
||||||
more appropriate to set up an environment variable with the same name
|
registers. Even though you can manipulate the value programmatically,
|
||||||
prior starting target application, e.g. on Intel P4 processor 'env
|
you most likely will find it more appropriate to set up an environment
|
||||||
OPENSSL_ia32cap=0x12900010 apps/openssl', to achieve same effect
|
variable with the same name prior to starting the target application, e.g. on
|
||||||
without modifying the application source code. Alternatively you can
|
Intel P4 processor 'env OPENSSL_ia32cap=0x16980010 apps/openssl', to
|
||||||
reconfigure the toolkit with no-sse2 option and recompile.
|
achieve the same effect without modifying the application source code.
|
||||||
|
Alternatively you can reconfigure the toolkit with no-sse2 option and
|
||||||
|
recompile.
|
||||||
|
|
||||||
Less intuitive is clearing bit #28. The truth is that it's not copied
|
Less intuitive is clearing bit #28. The truth is that it's not copied
|
||||||
from CPUID output verbatim, but is adjusted to reflect whether or not
|
from CPUID output verbatim, but is adjusted to reflect whether or not
|
||||||
@ -49,4 +71,3 @@ the data cache is actually shared between logical cores. This in turn
|
|||||||
affects the decision on whether or not expensive countermeasures
|
affects the decision on whether or not expensive countermeasures
|
||||||
against cache-timing attacks are applied, most notably in AES assembler
|
against cache-timing attacks are applied, most notably in AES assembler
|
||||||
module.
|
module.
|
||||||
=cut
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user