isa-l/include/multibinary.asm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifndef _MULTIBINARY_ASM_
%define _MULTIBINARY_ASM_
%define dq dq
%define ptr_sz qword
%define rdi rdi
%define rsi rsi
%define rax rax
%define rbx rbx
%define rcx rcx
%define rdx rdx
;;;;
; multibinary macro:
; creates the visible entry point that uses HW optimized call pointer
; creates the init of the HW optimized call pointer
;;;;
%macro mbin_interface 1
;;;;
; *_dispatched is defaulted to *_mbinit and replaced on first call.
; Therefore, *_dispatch_init is only executed on first call.
;;;;
section .data
%1_dispatched:
dq %1_mbinit
section .text
mk_global %1, function
%1_mbinit:
endbranch
;;; only called the first time to setup hardware match
call %1_dispatch_init
;;; falls through to execute the hw optimized code
%1:
endbranch
jmp qword [%1_dispatched]
%endmacro
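;;;;
; Illustrative usage sketch; "foo" and the foo_* variants below are
; hypothetical names, not symbols defined in this file. mbin_interface
; emits the public symbol foo, which jumps through foo_dispatched; the
; pointer starts at foo_mbinit, so the first call runs foo_dispatch_init
; and every later call jumps directly to the selected implementation:
;
;   mbin_interface foo
;   ; ... pair with one of the mbin_dispatch_init* macros below ...
;;;;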
;;;;;
; mbin_dispatch_init parameters
; Use this macro when SSE/00/01 is a minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
;;;;;
%macro mbin_dispatch_init 4
section .text
%1_dispatch_init:
push rsi
push rax
push rbx
push rcx
push rdx
lea rsi, [%2 WRT_OPT] ; Default to SSE 00/01
mov eax, 1
cpuid
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
lea rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
jne _%1_init_done ; AVX is not available so end
mov rsi, rbx
;; Try for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID7_EBX_AVX2
lea rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
cmovne rsi, rbx
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
je _%1_init_done
lea rsi, [%2 WRT_OPT]
_%1_init_done:
pop rdx
pop rcx
pop rbx
pop rax
mov [%1_dispatched], rsi
pop rsi
ret
%endmacro
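;;;;
; Illustrative usage sketch; the foo_* names are hypothetical. The
; generated foo_dispatch_init defaults to the SSE version, upgrades to
; AVX when CPUID reports AVX+OSXSAVE, and to AVX2 when CPUID leaf 7 and
; XGETBV confirm xmm/ymm state support:
;
;   mbin_interface foo
;   mbin_dispatch_init foo, foo_sse, foo_avx, foo_avx2
;;;;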
;;;;;
; mbin_dispatch_init_clmul 5 parameters
; Use this macro for CRC functions, which need both SSE4_1 and CLMUL
; 1-> function name
; 2-> base function
; 3-> SSE4_1 and CLMUL optimized function
; 4-> AVX/02 opt func
; 5-> AVX512/10 opt func
;;;;;
%macro mbin_dispatch_init_clmul 5
section .text
%1_dispatch_init:
push rsi
push rax
push rbx
push rcx
push rdx
push rdi
lea rsi, [%2 WRT_OPT] ; Default - use base function
mov eax, 1
cpuid
mov ebx, ecx ; save cpuid1.ecx
test ecx, FLAG_CPUID1_ECX_SSE4_1
jz _%1_init_done
test ecx, FLAG_CPUID1_ECX_CLMUL
jz _%1_init_done
lea rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
;; Test for XMM_YMM support/AVX
test ecx, FLAG_CPUID1_ECX_OSXSAVE
je _%1_init_done
xor ecx, ecx
xgetbv ; xcr -> edx:eax
mov edi, eax ; save xgetbv.eax
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
jne _%1_init_done
test ebx, FLAG_CPUID1_ECX_AVX
je _%1_init_done
lea rsi, [%4 WRT_OPT] ; AVX/02 opt
;; Test for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID7_EBX_AVX2
je _%1_init_done ; No AVX2 possible
;; Test for AVX512
and edi, FLAG_XGETBV_EAX_ZMM_OPM
cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
jne _%1_init_done ; No AVX512 possible
and ebx, FLAGS_CPUID7_EBX_AVX512_G1
cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
jne _%1_init_done
and ecx, FLAGS_CPUID7_ECX_AVX512_G2
cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
lea rbx, [%5 WRT_OPT] ; AVX512/10 opt
cmove rsi, rbx
_%1_init_done:
pop rdi
pop rdx
pop rcx
pop rbx
pop rax
mov [%1_dispatched], rsi
pop rsi
ret
%endmacro
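;;;;
; Illustrative usage sketch; the crc32_foo_* names are hypothetical.
; Parameter 2 is the portable base version, parameter 3 requires
; SSE4.1 + PCLMULQDQ, and parameters 4/5 are selected only when the AVX
; and AVX512 state/feature checks also pass:
;
;   mbin_interface crc32_foo
;   mbin_dispatch_init_clmul crc32_foo, crc32_foo_base, crc32_foo_pclmul, crc32_foo_avx, crc32_foo_avx512
;;;;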
;;;;;
; mbin_dispatch_init5 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
;;;;;
%macro mbin_dispatch_init5 5
section .text
%1_dispatch_init:
push rsi
push rax
push rbx
push rcx
push rdx
lea rsi, [%2 WRT_OPT] ; Default - use base function
mov eax, 1
cpuid
; Test for SSE4.2
test ecx, FLAG_CPUID1_ECX_SSE4_2
lea rbx, [%3 WRT_OPT] ; SSE opt func
cmovne rsi, rbx
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
lea rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
jne _%1_init_done ; AVX is not available so end
mov rsi, rbx
;; Try for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID7_EBX_AVX2
lea rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
cmovne rsi, rbx
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
je _%1_init_done
lea rsi, [%3 WRT_OPT]
_%1_init_done:
pop rdx
pop rcx
pop rbx
pop rax
mov [%1_dispatched], rsi
pop rsi
ret
%endmacro
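;;;;
; Illustrative usage sketch; the foo_* names are hypothetical. Unlike
; mbin_dispatch_init, a separate base function (parameter 2) remains the
; fallback when SSE4.2 is not reported:
;
;   mbin_dispatch_init5 foo, foo_base, foo_sse42, foo_avx, foo_avx2
;;;;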
;;;;;
; mbin_dispatch_init6 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
;;;;;
%macro mbin_dispatch_init6 6
section .text
%1_dispatch_init:
push rsi
push rax
push rbx
push rcx
push rdx
push rdi
lea rsi, [%2 WRT_OPT] ; Default - use base function
mov eax, 1
cpuid
mov ebx, ecx ; save cpuid1.ecx
test ecx, FLAG_CPUID1_ECX_SSE4_2
je _%1_init_done ; Use base function if no SSE4_2
lea rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
;; Test for XMM_YMM support/AVX
test ecx, FLAG_CPUID1_ECX_OSXSAVE
je _%1_init_done
xor ecx, ecx
xgetbv ; xcr -> edx:eax
mov edi, eax ; save xgetbv.eax
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
jne _%1_init_done
test ebx, FLAG_CPUID1_ECX_AVX
je _%1_init_done
lea rsi, [%4 WRT_OPT] ; AVX/02 opt
;; Test for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID7_EBX_AVX2
je _%1_init_done ; No AVX2 possible
lea rsi, [%5 WRT_OPT] ; AVX2/04 opt func
;; Test for AVX512
and edi, FLAG_XGETBV_EAX_ZMM_OPM
cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
jne _%1_init_done ; No AVX512 possible
and ebx, FLAGS_CPUID7_EBX_AVX512_G1
cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
lea rbx, [%6 WRT_OPT] ; AVX512/06 opt
cmove rsi, rbx
_%1_init_done:
pop rdi
pop rdx
pop rcx
pop rbx
pop rax
mov [%1_dispatched], rsi
pop rsi
ret
%endmacro
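;;;;
; Illustrative usage sketch; the foo_* names are hypothetical. Extends
; the init5 form with an AVX512/06 candidate, chosen only when ZMM and
; opmask state are OS-enabled and the group-1 AVX512 CPUID flags are set:
;
;   mbin_dispatch_init6 foo, foo_base, foo_sse42, foo_avx, foo_avx2, foo_avx512
;;;;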
;;;;;
; mbin_dispatch_init7 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX512 Update/10 opt func
;;;;;
%macro mbin_dispatch_init7 7
section .text
%1_dispatch_init:
push rsi
push rax
push rbx
push rcx
push rdx
push rdi
lea rsi, [%2 WRT_OPT] ; Default - use base function
mov eax, 1
cpuid
mov ebx, ecx ; save cpuid1.ecx
test ecx, FLAG_CPUID1_ECX_SSE4_2
je _%1_init_done ; Use base function if no SSE4_2
lea rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
;; Test for XMM_YMM support/AVX
test ecx, FLAG_CPUID1_ECX_OSXSAVE
je _%1_init_done
xor ecx, ecx
xgetbv ; xcr -> edx:eax
mov edi, eax ; save xgetbv.eax
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
jne _%1_init_done
test ebx, FLAG_CPUID1_ECX_AVX
je _%1_init_done
lea rsi, [%4 WRT_OPT] ; AVX/02 opt
;; Test for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID7_EBX_AVX2
je _%1_init_done ; No AVX2 possible
lea rsi, [%5 WRT_OPT] ; AVX2/04 opt func
;; Test for AVX512
and edi, FLAG_XGETBV_EAX_ZMM_OPM
cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
jne _%1_init_done ; No AVX512 possible
and ebx, FLAGS_CPUID7_EBX_AVX512_G1
cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
lea rbx, [%6 WRT_OPT] ; AVX512/06 opt
cmove rsi, rbx
and ecx, FLAGS_CPUID7_ECX_AVX512_G2
cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
lea rbx, [%7 WRT_OPT] ; AVX512/10 opt
cmove rsi, rbx
_%1_init_done:
pop rdi
pop rdx
pop rcx
pop rbx
pop rax
mov [%1_dispatched], rsi
pop rsi
ret
%endmacro
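;;;;
; Illustrative usage sketch; the foo_* names are hypothetical. Same as
; init6, plus an AVX512/10 candidate gated on the group-2 AVX512 CPUID
; flags:
;
;   mbin_dispatch_init7 foo, foo_base, foo_sse42, foo_avx, foo_avx2, foo_avx512, foo_avx512_g2
;;;;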
;;;;;
; mbin_dispatch_init8 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX2 Update/07 opt func
; 8-> AVX512 Update/10 opt func
;;;;;
%macro mbin_dispatch_init8 8
section .text
%1_dispatch_init:
push rsi
push rax
push rbx
push rcx
push rdx
push rdi
lea rsi, [%2 WRT_OPT] ; Default - use base function
mov eax, 1
cpuid
mov ebx, ecx ; save cpuid1.ecx
test ecx, FLAG_CPUID1_ECX_SSE4_2
je _%1_init_done ; Use base function if no SSE4_2
lea rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
;; Test for XMM_YMM support/AVX
test ecx, FLAG_CPUID1_ECX_OSXSAVE
je _%1_init_done
xor ecx, ecx
xgetbv ; xcr -> edx:eax
mov edi, eax ; save xgetbv.eax
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
jne _%1_init_done
test ebx, FLAG_CPUID1_ECX_AVX
je _%1_init_done
lea rsi, [%4 WRT_OPT] ; AVX/02 opt
;; Test for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID7_EBX_AVX2
je _%1_init_done ; No AVX2 possible
lea rsi, [%5 WRT_OPT] ; AVX2/04 opt func
;; Test for AVX512
and edi, FLAG_XGETBV_EAX_ZMM_OPM
cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
jne _%1_check_avx2_g2 ; No AVX512 possible
and ebx, FLAGS_CPUID7_EBX_AVX512_G1
cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
lea rbx, [%6 WRT_OPT] ; AVX512/06 opt
cmove rsi, rbx
and ecx, FLAGS_CPUID7_ECX_AVX512_G2
cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
lea rbx, [%8 WRT_OPT] ; AVX512/10 opt
cmove rsi, rbx
jmp _%1_init_done
_%1_check_avx2_g2:
;; Test for AVX2 Gen 2
and ecx, FLAGS_CPUID7_ECX_AVX2_G2
cmp ecx, FLAGS_CPUID7_ECX_AVX2_G2
lea rbx, [%7 WRT_OPT] ; AVX2/07 opt
cmove rsi, rbx
_%1_init_done:
pop rdi
pop rdx
pop rcx
pop rbx
pop rax
mov [%1_dispatched], rsi
pop rsi
ret
%endmacro
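;;;;
; Illustrative usage sketch; the foo_* names are hypothetical. Adds an
; AVX2 gen-2 candidate (parameter 7) used when the group-2 CPUID flags
; are set but AVX512 ZMM/opmask state is not enabled by the OS:
;
;   mbin_dispatch_init8 foo, foo_base, foo_sse42, foo_avx, foo_avx2, foo_avx512, foo_avx2_g2, foo_avx512_g2
;;;;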
%endif ; ifndef _MULTIBINARY_ASM_