;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmpb   r11b
 %define tmp3   arg4
 %define return rax
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define tmp    r11
 %define tmpb   r11b
 %define tmp3   r10
 %define return rax
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
 %endmacro
%endif

%define src  arg0
%define len  arg1
%define tmp0 arg2
%define tmp1 arg3

%use smartalign
ALIGNMODE P6

default rel
[bits 64]
section .text

; mem_zero_detect_avx512(src, len)
; Returns 0 in eax when every byte of src[0..len) is zero, non-zero otherwise.
align 32				; maximize uop-cache coverage
mk_global  mem_zero_detect_avx512, function
func(mem_zero_detect_avx512)
	FUNC_SAVE
	; Head: masked load of the bytes up to the next 64-byte boundary
	or	tmp1, -1		; all ones mask
	mov	eax, DWORD(src)
	and	eax, 63
	neg	rax
	add	rax, 64			; 64 - (src & 63)
	cmp	rax, len
	cmovae	eax, DWORD(len)		; head = min(64 - (src & 63), len)
	bzhi	tmp1, tmp1, rax		; alignment mask
	kmovq	k1, tmp1
	vmovdqu8 zmm0{k1}{z}, [src]
	add	src, rax		; align to cacheline
	sub	len, rax
	vptestmb k1, zmm0, zmm0
	xor	DWORD(tmp0), DWORD(tmp0)
	ktestq	k1, k1
	setnz	BYTE(tmp0)		; tmp0 = 1 if the head had a non-zero byte
	mov	DWORD(tmp3), DWORD(len)	; save remaining length for the tail
	xor	eax, eax
	shr	len, 7			; len / 128
	setz	al
	add	eax, DWORD(tmp0)
	jnz	.mem_z_small_block	; skip main loop if len < 128 or head non-zero

align 16
.mem_z_loop:
	; Main loop: test two aligned cachelines (128 bytes) per iteration
	vmovdqa64 zmm0, [src]
	vporq	zmm0, zmm0, [src+64]
	xor	tmp1, tmp1
	sub	len, 1
	setz	BYTE(tmp1)		; tmp1 = 1 on the last iteration
	add	src, 128
	vptestmb k1, zmm0, zmm0
	kmovq	tmp0, k1		; tmp0 != 0 if a non-zero byte was seen
	add	tmp1, tmp0		; for macrofusion.
	jz	.mem_z_loop		; continue while all zero and blocks remain

align 16
.mem_z_small_block:
	; Tail: fewer than 128 bytes remain (or a non-zero byte was already found)
	xor	eax, eax
	lea	tmp1, [rax-1]		; 0xFFFFFF...
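	; The tail is read with two masked loads built branchlessly below:
	; k1 covers the first min(len % 128, 64) tail bytes, and k2 covers
	; bytes [64, len % 128) when the tail spans a second cacheline,
	; otherwise k2 is left empty. Both masks are derived from len % 128
	; and len % 64 using bzhi and cmovb.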
	mov	DWORD(len), DWORD(tmp3)
	and	DWORD(len), 127		; len % 128
	and	DWORD(tmp3), 63		; len % 64
	bzhi	tmp, tmp1, tmp3		; mask for len % 64 bytes
	cmp	DWORD(len), 64
	cmovb	tmp1, tmp		; tail < 64: first load masked, second empty
	cmovb	tmp, rax
	kmovq	k1, tmp1
	kmovq	k2, tmp
	vmovdqu8 zmm0{k1}{z}, [src]
	vmovdqu8 zmm1{k2}{z}, [src+64]
	vporq	zmm0, zmm0, zmm1
	vptestmb k1, zmm0, zmm0
	kmovq	tmp1, k1
	or	tmp0, tmp1		; combine with earlier result
	setnz	al			; eax is still zero
	FUNC_RESTORE
	ret

endproc_frame
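; -----------------------------------------------------------------------
; Usage sketch (illustrative, not part of the original source). The routine
; follows the standard C calling convention: arg0 = buffer pointer, arg1 =
; length in bytes, and eax returns 0 when the whole buffer is zero.
; Assuming a hypothetical external symbol `buf` and constant BUF_LEN, a
; minimal SysV AMD64 caller might look like:
;
;	extern	mem_zero_detect_avx512
;	lea	rdi, [buf]		; arg0: region to test
;	mov	rsi, BUF_LEN		; arg1: length in bytes
;	call	mem_zero_detect_avx512
;	test	eax, eax		; eax == 0  ->  region was all zero
;	jz	.region_is_zero
; -----------------------------------------------------------------------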