mem: Move new mem_zero_detect function to avx2

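The new mem_zero_detect function uses AVX2 instructions (256-bit vpor, vpcmpeqb and vpmovmskb), so it will fail on AVX-only machines. Move it to mem_zero_detect_avx2 and give mem_zero_detect_avx a vptest-based implementation that needs only AVX.

Change-Id: I3bca49bff886f9c130c89e8c74b31110e9bac76b
Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>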
2024-12-12 09:23:50 +01:00 · 2021-09-30 17:04:08 -07:00 · 2021-09-30 17:04:08 -07:00 · 87908c9060
commit 87908c9060
parent 0e65117138
5 changed files with 274 additions and 82 deletions
--- a/Makefile.nmake
+++ b/Makefile.nmake
@ -159,6 +159,7 @@ objs = \
 	bin\igzip_set_long_icf_fg_04.obj \
 	bin\igzip_set_long_icf_fg_06.obj \
 	bin\mem_zero_detect_avx.obj \
+	bin\mem_zero_detect_avx2.obj \
 	bin\mem_zero_detect_sse.obj \
 	bin\mem_multibinary.obj

--- a/mem/Makefile.am
+++ b/mem/Makefile.am
@ -35,6 +35,7 @@ lsrc_base_aliases += mem/mem_zero_detect_base_aliases.c
 lsrc_ppc64le      += mem/mem_zero_detect_base_aliases.c

 lsrc_x86_64 += 	mem/mem_zero_detect_avx.asm \
+		mem/mem_zero_detect_avx2.asm \
 		mem/mem_zero_detect_sse.asm \
 		mem/mem_multibinary.asm

--- a/mem/mem_multibinary.asm
+++ b/mem/mem_multibinary.asm
@ -33,10 +33,11 @@
 default rel
 [bits 64]

+extern mem_zero_detect_avx2	; AVX2 variant, exported by mem_zero_detect_avx2.asm
 extern mem_zero_detect_avx
 extern mem_zero_detect_sse
 extern mem_zero_detect_base

 mbin_interface isal_zero_detect

-mbin_dispatch_init5 isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx
+mbin_dispatch_init5 isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx2	; last slot no longer repeats the AVX routine (presumably the AVX2 slot - confirm in the macro)
--- a/mem/mem_zero_detect_avx.asm
+++ b/mem/mem_zero_detect_avx.asm
@ -64,105 +64,126 @@

 %define src arg0
 %define	len arg1
-%define tmp0 arg2
-%define tmp1 arg3
+%define ptr arg2	; not referenced by the code added in this hunk
+%define pos return	; byte offset into src; aliases the return register, which every exit overwrites

-%use smartalign
-ALIGNMODE P6
 default rel

 [bits 64]
 section .text
-align 32	; maximize mu-ops cache usage
+; mem_zero_detect_avx(src, len): return = 0 when all len bytes at src are zero, 1 otherwise
+align 16
 mk_global  mem_zero_detect_avx, function
 func(mem_zero_detect_avx)
 	FUNC_SAVE
-	cmp	len, 127
-	jbe	.mem_z_small_block
-	; check the first 128 bytes
-	vpxor	xmm2, xmm2, xmm2
-	vmovdqu ymm0, [src]
-	vpor	ymm0, ymm0, [src+32]
-	vmovdqu	ymm1, [src+64]
-	vpor	ymm1, ymm1, [src+96]
-	vpor	ymm0, ymm0, ymm1
-	vpcmpeqb ymm0, ymm2, ymm0
-	vpmovmskb DWORD(tmp0), ymm0
-	not	DWORD(tmp0)
-	mov	DWORD(tmp1), DWORD(len)
-	and	DWORD(tmp1), 127
-	add	src, tmp1
-	xor	eax, eax
-	shr	len, 7	; len/128
-	test	len, len; break partial flag stall
-	setz	al	; if len < 128, eax != 0
-	add	eax, DWORD(tmp0) ; jump if (edx OR eax) !=0, use add for macrofusion
-	jnz .return
-	xor	eax, eax
+	mov	pos, 0
+	sub	len, 4*32	; len now holds the offset of the final 128-byte window
+	jle	.mem_z_small_block	; original len <= 128; NOTE(review): signed, so a len with bit 63 set also lands here - confirm callers never pass one
+

-align 16
 .mem_z_loop:
-	vmovdqu	ymm0, [src]
-	vpor	ymm0, ymm0,[src+32]
-	vmovdqu	ymm1, [src+64]
-	vpor	ymm1, ymm1, [src+96]
-	add	src, 128
-	xor	DWORD(tmp1), DWORD(tmp1)
-	sub	len, 1
-	setz	BYTE(tmp1)
-	vpor	ymm0, ymm0, ymm1
-	vpcmpeqb ymm0, ymm2, ymm0
-	vpmovmskb DWORD(tmp0), ymm0
-	not	DWORD(tmp0)
-	add	DWORD(tmp1), DWORD(tmp0)
-	jz	.mem_z_loop
+	vmovdqu	ymm0, [src+pos]	; load the 128 bytes starting at offset pos
+	vmovdqu	ymm1, [src+pos+1*32]
+	vmovdqu	ymm2, [src+pos+2*32]
+	vmovdqu	ymm3, [src+pos+3*32]
+	vptest	ymm0, ymm0	; ZF = 1 only when the whole vector is zero
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	vptest	ymm2, ymm2
+	jnz	.return_fail
+	vptest	ymm3, ymm3
+	jnz	.return_fail
+	add	pos, 4*32
+	cmp	pos, len
+	jl	.mem_z_loop	; keep going while pos is below the final window
+

-.return:
-	xor	eax, eax
-	test	tmp0, tmp0
-	setnz	al
+.mem_z_last_block:
+	vmovdqu	ymm0, [src+len]	; final 128 bytes; may overlap data the loop already checked
+	vmovdqu	ymm1, [src+len+1*32]
+	vmovdqu	ymm2, [src+len+2*32]
+	vmovdqu	ymm3, [src+len+3*32]
+	vptest	ymm0, ymm0
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	vptest	ymm2, ymm2
+	jnz	.return_fail
+	vptest	ymm3, ymm3
+	jnz	.return_fail
+
+.return_pass:
+	mov	return, 0	; every byte examined was zero
 	FUNC_RESTORE
 	ret


-align 16
 .mem_z_small_block:
-	;len < 128
-	xor	DWORD(tmp0), DWORD(tmp0)
-	movzx	DWORD(tmp1), BYTE(len)
-	cmp	DWORD(len), 16
-	jb     .mem_z_small_check_zero
-	;17 < len < 128
-	shr	DWORD(len), 4
-	xor	eax, eax ; alignment
-.mem_z_small_block_loop:
-	xor	eax, eax
-	mov	tmp0, [src]
-	or	tmp0, [src+8]
-	sub	DWORD(len), 1
-	setz	al
-	add	src, 16
-	add	rax, tmp0
-	jz	.mem_z_small_block_loop
+	add	len, 4*32	; undo the earlier subtraction: len is the original length again
+	cmp	len, 2*32
+	jl	.mem_z_lt64
+	vmovdqu	ymm0, [src]	; 64..128 bytes: first 64 and last 64 bytes, overlapping in the middle
+	vmovdqu	ymm1, [src+32]
+	vmovdqu	ymm2, [src+len-2*32]
+	vmovdqu	ymm3, [src+len-1*32]
+	vptest	ymm0, ymm0
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	vptest	ymm2, ymm2
+	jnz	.return_fail
+	vptest	ymm3, ymm3
+	jnz	.return_fail
+	jmp	.return_pass
+

-	test	tmp0, tmp0
-	jnz	.return_small
-	movzx	DWORD(len), BYTE(tmp1)
+.mem_z_lt64:
+	cmp	len, 32
+	jl	.mem_z_lt32
+	vmovdqu	ymm0, [src]	; 32..63 bytes: first 32 and last 32 bytes
+	vmovdqu	ymm1, [src+len-32]
+	vptest	ymm0, ymm0
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	jmp	.return_pass
+

-.mem_z_small_check_zero:
-	xor	DWORD(tmp0), DWORD(tmp0)
-	and	DWORD(len), 15
-	jz	.return_small
-.mem_z_small_byte_loop:
-	movzx	eax, byte [src]
-	add	src, 1
-	or	DWORD(tmp0), eax
-	sub	DWORD(len), 1
-	jnz	.mem_z_small_byte_loop
-.return_small:
-	xor	eax, eax
-	test	tmp0, tmp0
-	setnz	al
+
+.mem_z_lt32:
+	cmp	len, 16
+	jl	.mem_z_lt16
+	vmovdqu	xmm0, [src]	; 16..31 bytes: first 16 and last 16 bytes
+	vmovdqu	xmm1, [src+len-16]
+	vptest	xmm0, xmm0
+	jnz	.return_fail
+	vptest	xmm1, xmm1
+	jnz	.return_fail
+	jmp	.return_pass
+
+
+.mem_z_lt16:
+	cmp	len, 8
+	jl	.mem_z_lt8
+	mov	tmp, [src]	; 8..15 bytes: first and last qword (tmp, tmp3 and tmpb are defined outside this hunk)
+	mov	tmp3,[src+len-8]
+	or	tmp, tmp3
+	test	tmp, tmp
+	jnz	.return_fail
+	jmp	.return_pass
+
+.mem_z_lt8:
+	cmp	len, 0
+	je	.return_pass	; an empty buffer is reported as all zero
+.mem_z_1byte_loop:
+	mov	tmpb, [src+pos]	; pos is still 0 from function entry on the first pass
+	cmp	tmpb, 0
+	jnz	.return_fail
+	add	pos, 1
+	cmp	pos, len
+	jl	.mem_z_1byte_loop
+	jmp	.return_pass
+
+.return_fail:
+	mov	return, 1	; a non-zero byte was found
+	FUNC_RESTORE
 	ret

 endproc_frame
--- a/mem/mem_zero_detect_avx2.asm
+++ b/mem/mem_zero_detect_avx2.asm
@ -0,0 +1,168 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi	; System V AMD64: integer arguments arrive in rdi, rsi, rdx, rcx, r8, r9
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11	; tmp, tmpb and tmp3 are not referenced by the code in this file
+ %define tmpb  r11b
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x: endbranch	; plain label; endbranch is defined outside this file (presumably a CET landing pad - confirm)
+ %define FUNC_SAVE	; expands to nothing on elf64
+ %define FUNC_RESTORE	; expands to nothing on elf64
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx	; Microsoft x64: integer arguments arrive in rcx, rdx, r8, r9
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11	; tmp, tmpb and tmp3 are not referenced by the code in this file
+ %define tmpb  r11b
+ %define tmp3  r10
+ %define return rax
+ %define func(x) proc_frame x	; proc_frame comes from outside this file; it is closed by endproc_frame at the end of the file
+ %macro FUNC_SAVE 0
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ %endmacro
+%endif
+
+%define src arg0	; in: buffer pointer; advanced as the scan proceeds
+%define	len arg1	; in: byte count; reused as a block/chunk down-counter
+%define tmp0 arg2	; scratch: non-zero indicator for the data just examined
+%define tmp1 arg3	; scratch: len % 128, saved small length, or "last block" flag
+; Only rax, the four argument registers above and ymm0-ymm2 are written.
+%use smartalign
+ALIGNMODE P6
+default rel
+; mem_zero_detect_avx2(src, len): eax = 0 when all len bytes at src are zero (or len == 0), 1 otherwise
+[bits 64]
+section .text
+align 32	; maximize mu-ops cache usage
+mk_global  mem_zero_detect_avx2, function
+func(mem_zero_detect_avx2)
+	FUNC_SAVE
+	cmp	len, 127
+	jbe	.mem_z_small_block	; fewer than 128 bytes: scalar path
+	; check the first 128 bytes
+	vpxor	xmm2, xmm2, xmm2	; ymm2 = 0 (a VEX-encoded xmm write clears the upper half)
+	vmovdqu ymm0, [src]
+	vpor	ymm0, ymm0, [src+32]
+	vmovdqu	ymm1, [src+64]
+	vpor	ymm1, ymm1, [src+96]
+	vpor	ymm0, ymm0, ymm1	; ymm0 = OR of the four 32-byte vectors
+	vpcmpeqb ymm0, ymm2, ymm0	; 0xFF in every byte lane whose OR is zero
+	vpmovmskb DWORD(tmp0), ymm0	; one bit per lane; all ones when the block is zero
+	not	DWORD(tmp0)	; tmp0 == 0 iff all 128 bytes are zero
+	mov	DWORD(tmp1), DWORD(len)
+	and	DWORD(tmp1), 127	; tmp1 = len % 128
+	add	src, tmp1	; skip the remainder; those bytes lie inside the block just checked
+	xor	eax, eax
+	shr	len, 7	; len/128
+	test	len, len; break partial flag stall
+	setz	al	; len >= 128 on this path, so the block count is >= 1 and al stays 0
+	add	eax, DWORD(tmp0) ; eax = tmp0 here; add is used for macrofusion with jnz
+	jnz .return	; non-zero byte found in the first block
+	xor	eax, eax
+
+align 16
+.mem_z_loop:	; one 128-byte block per iteration; len = blocks left
+	vmovdqu	ymm0, [src]
+	vpor	ymm0, ymm0,[src+32]
+	vmovdqu	ymm1, [src+64]
+	vpor	ymm1, ymm1, [src+96]
+	add	src, 128
+	xor	DWORD(tmp1), DWORD(tmp1)
+	sub	len, 1
+	setz	BYTE(tmp1)	; tmp1 = 1 on the final block
+	vpor	ymm0, ymm0, ymm1
+	vpcmpeqb ymm0, ymm2, ymm0
+	vpmovmskb DWORD(tmp0), ymm0
+	not	DWORD(tmp0)	; zero-extended 32-bit mask of non-zero lanes
+	add	tmp1, tmp0	; 64-bit add: a 32-bit 1 + 0xFFFFFFFF wraps to 0 and would loop past the buffer
+	jz	.mem_z_loop	; continue while the block is zero and is not the last one
+
+.return:
+	xor	eax, eax
+	test	tmp0, tmp0
+	setnz	al	; 1 if the last block examined held a non-zero byte
+	FUNC_RESTORE
+	ret
+
+
+align 16
+.mem_z_small_block:
+	;len < 128
+	xor	DWORD(tmp0), DWORD(tmp0)
+	movzx	DWORD(tmp1), BYTE(len)	; save len; the chunk counter below overwrites it
+	cmp	DWORD(len), 16
+	jb     .mem_z_small_check_zero
+	;16 <= len < 128
+	shr	DWORD(len), 4	; number of whole 16-byte chunks
+	xor	eax, eax ; alignment
+.mem_z_small_block_loop:
+	xor	eax, eax
+	mov	tmp0, [src]
+	or	tmp0, [src+8]	; tmp0 = OR of the chunk's two qwords
+	sub	DWORD(len), 1
+	setz	al	; al = 1 on the final chunk
+	add	src, 16
+	or	rax, tmp0	; or, not add: 1 + 0xFFFFFFFFFFFFFFFF wraps to 0 and would loop past the buffer
+	jz	.mem_z_small_block_loop
+
+	test	tmp0, tmp0
+	jnz	.return_small
+	movzx	DWORD(len), BYTE(tmp1)	; restore the original length
+
+.mem_z_small_check_zero:
+	xor	DWORD(tmp0), DWORD(tmp0)
+	and	DWORD(len), 15	; bytes left after the 16-byte chunks
+	jz	.return_small
+.mem_z_small_byte_loop:
+	movzx	eax, byte [src]
+	add	src, 1
+	or	DWORD(tmp0), eax	; accumulate any non-zero byte
+	sub	DWORD(len), 1
+	jnz	.mem_z_small_byte_loop
+.return_small:
+	xor	eax, eax
+	test	tmp0, tmp0
+	setnz	al
+	ret
+
+endproc_frame