isa-l/mem/mem_zero_detect_avx2.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define tmpb  r11b
 %define tmp3  arg4
 %define return rax
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define tmp   r11
 %define tmpb  r11b
 %define tmp3  r10
 %define return rax
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
 %endmacro
%endif

%define src arg0
%define	len arg1
%define tmp0 arg2
%define tmp1 arg3

%use smartalign
ALIGNMODE P6
default rel

[bits 64]
section .text
align 32	; maximize mu-ops cache usage
mk_global  mem_zero_detect_avx2, function
func(mem_zero_detect_avx2)
	FUNC_SAVE
	cmp	len, 127
	jbe	.mem_z_small_block
	; check the first 128 bytes
	vpxor	xmm2, xmm2, xmm2
	vmovdqu ymm0, [src]
	vpor	ymm0, ymm0, [src+32]
	vmovdqu	ymm1, [src+64]
	vpor	ymm1, ymm1, [src+96]
	vpor	ymm0, ymm0, ymm1
	vpcmpeqb ymm0, ymm2, ymm0
	vpmovmskb DWORD(tmp0), ymm0
	not	DWORD(tmp0)
	mov	DWORD(tmp1), DWORD(len)
	and	DWORD(tmp1), 127
	add	src, tmp1
	xor	eax, eax
	shr	len, 7	; len/128
	test	len, len; break partial flag stall
	setz	al	; if len < 128, eax != 0
	add	eax, DWORD(tmp0) ; jump if (edx OR eax) !=0, use add for macrofusion
	jnz .return
	xor	eax, eax

align 16
.mem_z_loop:
	vmovdqu	ymm0, [src]
	vpor	ymm0, ymm0,[src+32]
	vmovdqu	ymm1, [src+64]
	vpor	ymm1, ymm1, [src+96]
	add	src, 128
	xor	DWORD(tmp1), DWORD(tmp1)
	sub	len, 1
	setz	BYTE(tmp1)
	vpor	ymm0, ymm0, ymm1
	vpcmpeqb ymm0, ymm2, ymm0
	vpmovmskb DWORD(tmp0), ymm0
	not	DWORD(tmp0)
	add	DWORD(tmp1), DWORD(tmp0)
	jz	.mem_z_loop

.return:
	xor	eax, eax
	test	tmp0, tmp0
	setnz	al
	FUNC_RESTORE
	ret


align 16
.mem_z_small_block:
	;len < 128
	xor	DWORD(tmp0), DWORD(tmp0)
	movzx	DWORD(tmp1), BYTE(len)
	cmp	DWORD(len), 16
	jb     .mem_z_small_check_zero
	;17 < len < 128
	shr	DWORD(len), 4
	xor	eax, eax ; alignment
.mem_z_small_block_loop:
	xor	eax, eax
	mov	tmp0, [src]
	or	tmp0, [src+8]
	sub	DWORD(len), 1
	setz	al
	add	src, 16
	add	rax, tmp0
	jz	.mem_z_small_block_loop

	test	tmp0, tmp0
	jnz	.return_small
	movzx	DWORD(len), BYTE(tmp1)

.mem_z_small_check_zero:
	xor	DWORD(tmp0), DWORD(tmp0)
	and	DWORD(len), 15
	jz	.return_small
.mem_z_small_byte_loop:
	movzx	eax, byte [src]
	add	src, 1
	or	DWORD(tmp0), eax
	sub	DWORD(len), 1
	jnz	.mem_z_small_byte_loop
.return_small:
	xor	eax, eax
	test	tmp0, tmp0
	setnz	al
	ret

endproc_frame