diff --git a/include/multibinary.asm b/include/multibinary.asm index cef0a85..f41544d 100644 --- a/include/multibinary.asm +++ b/include/multibinary.asm @@ -239,7 +239,7 @@ ret %endmacro -%ifdef HAVE_AS_KNOWS_AVX512 +%if AS_FEATURE_LEVEL >= 6 ;;;;; ; mbin_dispatch_init6 parameters ; 1-> function name @@ -293,8 +293,8 @@ and edi, FLAG_XGETBV_EAX_ZMM_OPM cmp edi, FLAG_XGETBV_EAX_ZMM_OPM jne _%1_init_done ; No AVX512 possible - and ebx, FLAGS_CPUID7_ECX_AVX512_G1 - cmp ebx, FLAGS_CPUID7_ECX_AVX512_G1 + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt cmove mbin_rsi, mbin_rbx @@ -315,4 +315,85 @@ %endmacro %endif +%if AS_FEATURE_LEVEL >= 10 +;;;;; +; mbin_dispatch_init7 parameters +; 1-> function name +; 2-> base function +; 3-> SSE4_2 or 00/01 optimized function +; 4-> AVX/02 opt func +; 5-> AVX2/04 opt func +; 6-> AVX512/06 opt func +; 7-> AVX512 Update/10 opt func +;;;;; +%macro mbin_dispatch_init7 7 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + push mbin_rdi + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + mov ebx, ecx ; save cpuid1.ecx + test ecx, FLAG_CPUID1_ECX_SSE4_2 + je _%1_init_done ; Use base function if no SSE4_2 + lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt + + ;; Test for XMM_YMM support/AVX + test ecx, FLAG_CPUID1_ECX_OSXSAVE + je _%1_init_done + xor ecx, ecx + xgetbv ; xcr -> edx:eax + mov edi, eax ; save xgetvb.eax + + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + jne _%1_init_done + test ebx, FLAG_CPUID1_ECX_AVX + je _%1_init_done + lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt + + ;; Test for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID7_EBX_AVX2 + je _%1_init_done ; No AVX2 possible + lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func + + ;; Test for AVX512 + and edi, FLAG_XGETBV_EAX_ZMM_OPM + cmp edi, FLAG_XGETBV_EAX_ZMM_OPM + jne _%1_init_done ; No AVX512 possible + and ebx, FLAGS_CPUID7_EBX_AVX512_G1 + cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1 + lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + and ecx, FLAGS_CPUID7_ECX_AVX512_G2 + cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2 + lea mbin_rbx, [%7 WRT_OPT] ; AVX512/06 opt + cmove mbin_rsi, mbin_rbx + + _%1_init_done: + pop mbin_rdi + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro +%else +%macro mbin_dispatch_init7 7 + mbin_dispatch_init5 %1, %2, %3, %4, %5 +%endmacro +%endif + %endif ; ifndef _MULTIBINARY_ASM_ diff --git a/include/reg_sizes.asm b/include/reg_sizes.asm index 7a990f7..fec6a8a 100644 --- a/include/reg_sizes.asm +++ b/include/reg_sizes.asm @@ -38,6 +38,10 @@ %endif %endif +%ifndef AS_FEATURE_LEVEL +%define AS_FEATURE_LEVEL 4 +%endif + %define EFLAGS_HAS_CPUID (1<<21) %define FLAG_CPUID1_ECX_CLMUL (1<<1) %define FLAG_CPUID1_EDX_SSE2 (1<<26) @@ -59,9 +63,18 @@ %define FLAG_CPUID7_EBX_AVX512CD (1<<28) %define FLAG_CPUID7_EBX_AVX512BW (1<<30) %define FLAG_CPUID7_EBX_AVX512VL (1<<31) -%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1) -%define FLAGS_CPUID7_ECX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ) +%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1) +%define FLAG_CPUID7_ECX_AVX512VBMI2 (1 << 6) +%define FLAG_CPUID7_ECX_GFNI (1 << 8) +%define FLAG_CPUID7_ECX_VAES (1 << 9) +%define FLAG_CPUID7_ECX_VPCLMULQDQ (1 << 10) +%define FLAG_CPUID7_ECX_VNNI (1 << 11) +%define FLAG_CPUID7_ECX_BITALG (1 << 12) +%define FLAG_CPUID7_ECX_VPOPCNTDQ (1 << 14) + +%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ) +%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ) %define FLAG_XGETBV_EAX_XMM (1<<1) %define FLAG_XGETBV_EAX_YMM (1<<2)