x86[_64]cpuid.pl: harmonize OPENSSL_ia32_cpuid [from HEAD].
This commit is contained in:
parent
4a46dc6e5c
commit
10fd0b7b55
@ -7,15 +7,24 @@ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
|
||||
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open STDOUT,"| $^X $xlate $flavour $output";
|
||||
|
||||
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
|
||||
("%rdi","%rsi","%rdx","%rcx"); # Unix order
|
||||
|
||||
if ($win64) { $arg1="%rcx"; $arg2="%rdx"; }
|
||||
else { $arg1="%rdi"; $arg2="%rsi"; }
|
||||
print<<___;
|
||||
.extern OPENSSL_cpuid_setup
|
||||
.hidden OPENSSL_cpuid_setup
|
||||
.section .init
|
||||
call OPENSSL_cpuid_setup
|
||||
|
||||
.hidden OPENSSL_ia32cap_P
|
||||
.comm OPENSSL_ia32cap_P,8
|
||||
|
||||
.text
|
||||
|
||||
.globl OPENSSL_atomic_add
|
||||
@ -46,7 +55,7 @@ OPENSSL_rdtsc:
|
||||
.type OPENSSL_ia32_cpuid,\@abi-omnipotent
|
||||
.align 16
|
||||
OPENSSL_ia32_cpuid:
|
||||
mov %rbx,%r8
|
||||
mov %rbx,%r8 # save %rbx
|
||||
|
||||
xor %eax,%eax
|
||||
cpuid
|
||||
@ -78,7 +87,15 @@ OPENSSL_ia32_cpuid:
|
||||
# AMD specific
|
||||
mov \$0x80000000,%eax
|
||||
cpuid
|
||||
cmp \$0x80000008,%eax
|
||||
cmp \$0x80000001,%eax
|
||||
jb .Lintel
|
||||
mov %eax,%r10d
|
||||
mov \$0x80000001,%eax
|
||||
cpuid
|
||||
or %ecx,%r9d
|
||||
and \$0x00000801,%r9d # keep bit 0 and the AMD XOP bit, 1<<11
|
||||
|
||||
cmp \$0x80000008,%r10d
|
||||
jb .Lintel
|
||||
|
||||
mov \$0x80000008,%eax
|
||||
@ -89,12 +106,12 @@ OPENSSL_ia32_cpuid:
|
||||
mov \$1,%eax
|
||||
cpuid
|
||||
bt \$28,%edx # test hyper-threading bit
|
||||
jnc .Ldone
|
||||
jnc .Lgeneric
|
||||
shr \$16,%ebx # number of logical processors
|
||||
cmp %r10b,%bl
|
||||
ja .Ldone
|
||||
ja .Lgeneric
|
||||
and \$0xefffffff,%edx # ~(1<<28)
|
||||
jmp .Ldone
|
||||
jmp .Lgeneric
|
||||
|
||||
.Lintel:
|
||||
cmp \$4,%r11d
|
||||
@ -111,30 +128,47 @@ OPENSSL_ia32_cpuid:
|
||||
.Lnocacheinfo:
|
||||
mov \$1,%eax
|
||||
cpuid
|
||||
and \$0xbfefffff,%edx # force reserved bits to 0
|
||||
cmp \$0,%r9d
|
||||
jne .Lnotintel
|
||||
or \$0x00100000,%edx # use reserved 20th bit to engage RC4_CHAR
|
||||
or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs
|
||||
and \$15,%ah
|
||||
cmp \$15,%ah # examine Family ID
|
||||
je .Lnotintel
|
||||
or \$0x40000000,%edx # use reserved bit to skip unrolled loop
|
||||
jne .Lnotintel
|
||||
or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR
|
||||
.Lnotintel:
|
||||
bt \$28,%edx # test hyper-threading bit
|
||||
jnc .Ldone
|
||||
jnc .Lgeneric
|
||||
and \$0xefffffff,%edx # ~(1<<28)
|
||||
cmp \$0,%r10d
|
||||
je .Ldone
|
||||
je .Lgeneric
|
||||
|
||||
or \$0x10000000,%edx # 1<<28
|
||||
shr \$16,%ebx
|
||||
cmp \$1,%bl # see if cache is shared
|
||||
ja .Ldone
|
||||
ja .Lgeneric
|
||||
and \$0xefffffff,%edx # ~(1<<28)
|
||||
.Lgeneric:
|
||||
and \$0x00000800,%r9d # isolate AMD XOP flag
|
||||
and \$0xfffff7ff,%ecx
|
||||
or %ecx,%r9d # merge AMD XOP flag
|
||||
|
||||
mov %edx,%r10d # %r9d:%r10d is copy of %ecx:%edx
|
||||
bt \$27,%r9d # check OSXSAVE bit
|
||||
jnc .Lclear_avx
|
||||
xor %ecx,%ecx # XCR0
|
||||
.byte 0x0f,0x01,0xd0 # xgetbv
|
||||
and \$6,%eax # isolate XMM and YMM state support
|
||||
cmp \$6,%eax
|
||||
je .Ldone
|
||||
.Lclear_avx:
|
||||
mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
|
||||
and %eax,%r9d # clear AVX, FMA and AMD XOP bits
|
||||
.Ldone:
|
||||
shl \$32,%rcx
|
||||
mov %edx,%eax
|
||||
mov %r8,%rbx
|
||||
or %rcx,%rax
|
||||
shl \$32,%r9
|
||||
mov %r10d,%eax
|
||||
mov %r8,%rbx # restore %rbx
|
||||
or %r9,%rax
|
||||
ret
|
||||
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
||||
|
||||
|
@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
&pop ("eax");
|
||||
&xor ("ecx","eax");
|
||||
&bt ("ecx",21);
|
||||
&jnc (&label("done"));
|
||||
&jnc (&label("generic"));
|
||||
&xor ("eax","eax");
|
||||
&cpuid ();
|
||||
&mov ("edi","eax"); # max value for standard query level
|
||||
@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
# AMD specific
|
||||
&mov ("eax",0x80000000);
|
||||
&cpuid ();
|
||||
&cmp ("eax",0x80000008);
|
||||
&cmp ("eax",0x80000001);
|
||||
&jb (&label("intel"));
|
||||
&mov ("esi","eax");
|
||||
&mov ("eax",0x80000001);
|
||||
&cpuid ();
|
||||
&or ("ebp","ecx");
|
||||
&and ("ebp",1<<11|1); # keep bit 0 and the XOP bit, 1<<11
|
||||
&cmp ("esi",0x80000008);
|
||||
&jb (&label("intel"));
|
||||
|
||||
&mov ("eax",0x80000008);
|
||||
@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
&mov ("eax",1);
|
||||
&cpuid ();
|
||||
&bt ("edx",28);
|
||||
&jnc (&label("done"));
|
||||
&jnc (&label("generic"));
|
||||
&shr ("ebx",16);
|
||||
&and ("ebx",0xff);
|
||||
&cmp ("ebx","esi");
|
||||
&ja (&label("done"));
|
||||
&ja (&label("generic"));
|
||||
&and ("edx",0xefffffff); # clear hyper-threading bit
|
||||
&jmp (&label("done"));
|
||||
&jmp (&label("generic"));
|
||||
|
||||
&set_label("intel");
|
||||
&cmp ("edi",4);
|
||||
@ -85,27 +92,52 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
&set_label("nocacheinfo");
|
||||
&mov ("eax",1);
|
||||
&cpuid ();
|
||||
&and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
|
||||
&cmp ("ebp",0);
|
||||
&jne (&label("notP4"));
|
||||
&jne (&label("notintel"));
|
||||
&or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
|
||||
&and (&HB("eax"),15); # family ID
|
||||
&cmp (&HB("eax"),15); # P4?
|
||||
&jne (&label("notP4"));
|
||||
&or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
|
||||
&set_label("notP4");
|
||||
&jne (&label("notintel"));
|
||||
&or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR
|
||||
&set_label("notintel");
|
||||
&bt ("edx",28); # test hyper-threading bit
|
||||
&jnc (&label("done"));
|
||||
&jnc (&label("generic"));
|
||||
&and ("edx",0xefffffff);
|
||||
&cmp ("edi",0);
|
||||
&je (&label("done"));
|
||||
&je (&label("generic"));
|
||||
|
||||
&or ("edx",0x10000000);
|
||||
&shr ("ebx",16);
|
||||
&cmp (&LB("ebx"),1);
|
||||
&ja (&label("done"));
|
||||
&ja (&label("generic"));
|
||||
&and ("edx",0xefffffff); # clear hyper-threading bit if cache is not shared
|
||||
|
||||
&set_label("generic");
|
||||
&and ("ebp",1<<11); # isolate AMD XOP flag
|
||||
&and ("ecx",0xfffff7ff); # force 11th bit to 0
|
||||
&mov ("esi","edx");
|
||||
&or ("ebp","ecx"); # merge AMD XOP flag
|
||||
|
||||
&bt ("ecx",26); # check XSAVE bit
|
||||
&jnc (&label("done"));
|
||||
&bt ("ecx",27); # check OSXSAVE bit
|
||||
&jnc (&label("clear_xmm"));
|
||||
&xor ("ecx","ecx");
|
||||
&data_byte(0x0f,0x01,0xd0); # xgetbv
|
||||
&and ("eax",6);
|
||||
&cmp ("eax",6);
|
||||
&je (&label("done"));
|
||||
&cmp ("eax",2);
|
||||
&je (&label("clear_avx"));
|
||||
&set_label("clear_xmm");
|
||||
&and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits
|
||||
&and ("esi",0xfeffffff); # clear FXSR
|
||||
&set_label("clear_avx");
|
||||
&and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
|
||||
&set_label("done");
|
||||
&mov ("eax","edx");
|
||||
&mov ("edx","ecx");
|
||||
&mov ("eax","esi");
|
||||
&mov ("edx","ebp");
|
||||
&function_end("OPENSSL_ia32_cpuid");
|
||||
|
||||
&external_label("OPENSSL_ia32cap_P");
|
||||
@ -199,8 +231,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
&bt (&DWP(0,"ecx"),1);
|
||||
&jnc (&label("no_x87"));
|
||||
if ($sse2) {
|
||||
&bt (&DWP(0,"ecx"),26);
|
||||
&jnc (&label("no_sse2"));
|
||||
&and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
|
||||
&cmp ("ecx",1<<26|1<<24);
|
||||
&jne (&label("no_sse2"));
|
||||
&pxor ("xmm0","xmm0");
|
||||
&pxor ("xmm1","xmm1");
|
||||
&pxor ("xmm2","xmm2");
|
||||
|
Loading…
x
Reference in New Issue
Block a user