diff --git a/Makefile.nmake b/Makefile.nmake
index b8a34e9..d36e92f 100644
--- a/Makefile.nmake
+++ b/Makefile.nmake
@@ -159,6 +159,7 @@ objs = \
 	bin\igzip_set_long_icf_fg_04.obj \
 	bin\igzip_set_long_icf_fg_06.obj \
 	bin\mem_zero_detect_avx.obj \
+	bin\mem_zero_detect_avx2.obj \
 	bin\mem_zero_detect_sse.obj \
 	bin\mem_multibinary.obj
 
diff --git a/mem/Makefile.am b/mem/Makefile.am
index ed5acfc..f6eec75 100644
--- a/mem/Makefile.am
+++ b/mem/Makefile.am
@@ -35,6 +35,7 @@ lsrc_base_aliases += mem/mem_zero_detect_base_aliases.c
 lsrc_ppc64le += mem/mem_zero_detect_base_aliases.c
 
 lsrc_x86_64 += mem/mem_zero_detect_avx.asm \
+	mem/mem_zero_detect_avx2.asm \
 	mem/mem_zero_detect_sse.asm \
 	mem/mem_multibinary.asm
 
diff --git a/mem/mem_multibinary.asm b/mem/mem_multibinary.asm
index 38f63e2..6201b09 100644
--- a/mem/mem_multibinary.asm
+++ b/mem/mem_multibinary.asm
@@ -33,10 +33,11 @@
 
 default rel
 [bits 64]
 
+extern mem_zero_detect_avx2
 extern mem_zero_detect_avx
 extern mem_zero_detect_sse
 extern mem_zero_detect_base
 
 mbin_interface		isal_zero_detect
-mbin_dispatch_init5	isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx
+mbin_dispatch_init5	isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx2
diff --git a/mem/mem_zero_detect_avx.asm b/mem/mem_zero_detect_avx.asm
index 8939579..1b5de84 100644
--- a/mem/mem_zero_detect_avx.asm
+++ b/mem/mem_zero_detect_avx.asm
@@ -64,105 +64,126 @@
 %define src  arg0
 %define len  arg1
-%define tmp0 arg2
-%define tmp1 arg3
+%define ptr  arg2
+%define pos  return
 
-%use smartalign
-ALIGNMODE P6
 default rel
 
 [bits 64]
 section .text
-align 32 ; maximize mu-ops cache usage
+
+align 16
 mk_global mem_zero_detect_avx, function
 func(mem_zero_detect_avx)
 	FUNC_SAVE
-	cmp	len, 127
-	jbe	.mem_z_small_block
-	; check the first 128 bytes
-	vpxor	xmm2, xmm2, xmm2
-	vmovdqu	ymm0, [src]
-	vpor	ymm0, ymm0, [src+32]
-	vmovdqu	ymm1, [src+64]
-	vpor	ymm1, ymm1, [src+96]
-	vpor	ymm0, ymm0, ymm1
-	vpcmpeqb	ymm0, ymm2, ymm0
-	vpmovmskb	DWORD(tmp0), ymm0
-	not	DWORD(tmp0)
-	mov	DWORD(tmp1), DWORD(len)
-	and	DWORD(tmp1), 127
-	add	src, tmp1
-	xor	eax, eax
-	shr	len, 7			; len/128
-	test	len, len		; break partial flag stall
-	setz	al			; if len < 128, eax != 0
-	add	eax, DWORD(tmp0)	; jump if (edx OR eax) !=0, use add for macrofusion
-	jnz	.return
-	xor	eax, eax
+	mov	pos, 0
+	sub	len, 4*32
+	jle	.mem_z_small_block
 
-align 16
 .mem_z_loop:
-	vmovdqu	ymm0, [src]
-	vpor	ymm0, ymm0, [src+32]
-	vmovdqu	ymm1, [src+64]
-	vpor	ymm1, ymm1, [src+96]
-	add	src, 128
-	xor	DWORD(tmp1), DWORD(tmp1)
-	sub	len, 1
-	setz	BYTE(tmp1)
-	vpor	ymm0, ymm0, ymm1
-	vpcmpeqb	ymm0, ymm2, ymm0
-	vpmovmskb	DWORD(tmp0), ymm0
-	not	DWORD(tmp0)
-	add	DWORD(tmp1), DWORD(tmp0)
-	jz	.mem_z_loop
+	vmovdqu	ymm0, [src+pos]
+	vmovdqu	ymm1, [src+pos+1*32]
+	vmovdqu	ymm2, [src+pos+2*32]
+	vmovdqu	ymm3, [src+pos+3*32]
+	vptest	ymm0, ymm0
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	vptest	ymm2, ymm2
+	jnz	.return_fail
+	vptest	ymm3, ymm3
+	jnz	.return_fail
+	add	pos, 4*32
+	cmp	pos, len
+	jl	.mem_z_loop
 
-.return:
-	xor	eax, eax
-	test	tmp0, tmp0
-	setnz	al
+.mem_z_last_block:
+	vmovdqu	ymm0, [src+len]
+	vmovdqu	ymm1, [src+len+1*32]
+	vmovdqu	ymm2, [src+len+2*32]
+	vmovdqu	ymm3, [src+len+3*32]
+	vptest	ymm0, ymm0
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	vptest	ymm2, ymm2
+	jnz	.return_fail
+	vptest	ymm3, ymm3
+	jnz	.return_fail
+
+.return_pass:
+	mov	return, 0
 	FUNC_RESTORE
 	ret
 
 
-align 16
 .mem_z_small_block:
-	;len < 128
-	xor	DWORD(tmp0), DWORD(tmp0)
-	movzx	DWORD(tmp1), BYTE(len)
-	cmp	DWORD(len), 16
-	jb	.mem_z_small_check_zero
-	;17 < len < 128
-	shr	DWORD(len), 4
-	xor	eax, eax	; alignment
-.mem_z_small_block_loop:
-	xor	eax, eax
-	mov	tmp0, [src]
-	or	tmp0, [src+8]
-	sub	DWORD(len), 1
-	setz	al
-	add	src, 16
-	add	rax, tmp0
-	jz	.mem_z_small_block_loop
+	add	len, 4*32
+	cmp	len, 2*32
+	jl	.mem_z_lt64
+	vmovdqu	ymm0, [src]
+	vmovdqu	ymm1, [src+32]
+	vmovdqu	ymm2, [src+len-2*32]
+	vmovdqu	ymm3, [src+len-1*32]
+	vptest	ymm0, ymm0
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	vptest	ymm2, ymm2
+	jnz	.return_fail
+	vptest	ymm3, ymm3
+	jnz	.return_fail
+	jmp	.return_pass
 
-	test	tmp0, tmp0
-	jnz	.return_small
-	movzx	DWORD(len), BYTE(tmp1)
+.mem_z_lt64:
+	cmp	len, 32
+	jl	.mem_z_lt32
+	vmovdqu	ymm0, [src]
+	vmovdqu	ymm1, [src+len-32]
+	vptest	ymm0, ymm0
+	jnz	.return_fail
+	vptest	ymm1, ymm1
+	jnz	.return_fail
+	jmp	.return_pass
 
-.mem_z_small_check_zero:
-	xor	DWORD(tmp0), DWORD(tmp0)
-	and	DWORD(len), 15
-	jz	.return_small
-.mem_z_small_byte_loop:
-	movzx	eax, byte [src]
-	add	src, 1
-	or	DWORD(tmp0), eax
-	sub	DWORD(len), 1
-	jnz	.mem_z_small_byte_loop
-.return_small:
-	xor	eax, eax
-	test	tmp0, tmp0
-	setnz	al
+
+.mem_z_lt32:
+	cmp	len, 16
+	jl	.mem_z_lt16
+	vmovdqu	xmm0, [src]
+	vmovdqu	xmm1, [src+len-16]
+	vptest	xmm0, xmm0
+	jnz	.return_fail
+	vptest	xmm1, xmm1
+	jnz	.return_fail
+	jmp	.return_pass
+
+
+.mem_z_lt16:
+	cmp	len, 8
+	jl	.mem_z_lt8
+	mov	tmp, [src]
+	mov	tmp3, [src+len-8]
+	or	tmp, tmp3
+	test	tmp, tmp
+	jnz	.return_fail
+	jmp	.return_pass
+
+.mem_z_lt8:
+	cmp	len, 0
+	je	.return_pass
+.mem_z_1byte_loop:
+	mov	tmpb, [src+pos]
+	cmp	tmpb, 0
+	jnz	.return_fail
+	add	pos, 1
+	cmp	pos, len
+	jl	.mem_z_1byte_loop
+	jmp	.return_pass
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
 	ret
 
 
 endproc_frame
diff --git a/mem/mem_zero_detect_avx2.asm b/mem/mem_zero_detect_avx2.asm
new file mode 100644
index 0000000..b52774b
--- /dev/null
+++ b/mem/mem_zero_detect_avx2.asm
@@ -0,0 +1,168 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmpb  r11b
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x: endbranch
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmpb  r11b
+ %define tmp3  r10
+ %define return rax
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+ %endmacro
+%endif
+
+%define src  arg0
+%define len  arg1
+%define tmp0 arg2
+%define tmp1 arg3
+
+%use smartalign
+ALIGNMODE P6
+default rel
+
+[bits 64]
+section .text
+align 32 ; maximize mu-ops cache usage
+mk_global mem_zero_detect_avx2, function
+func(mem_zero_detect_avx2)
+	FUNC_SAVE
+	cmp	len, 127
+	jbe	.mem_z_small_block
+	; check the first 128 bytes
+	vpxor	xmm2, xmm2, xmm2
+	vmovdqu	ymm0, [src]
+	vpor	ymm0, ymm0, [src+32]
+	vmovdqu	ymm1, [src+64]
+	vpor	ymm1, ymm1, [src+96]
+	vpor	ymm0, ymm0, ymm1
+	vpcmpeqb	ymm0, ymm2, ymm0
+	vpmovmskb	DWORD(tmp0), ymm0
+	not	DWORD(tmp0)
+	mov	DWORD(tmp1), DWORD(len)
+	and	DWORD(tmp1), 127
+	add	src, tmp1
+	xor	eax, eax
+	shr	len, 7			; len/128
+	test	len, len		; break partial flag stall
+	setz	al			; if len < 128, eax != 0
+	add	eax, DWORD(tmp0)	; jump if (edx OR eax) !=0, use add for macrofusion
+	jnz	.return
+	xor	eax, eax
+
+align 16
+.mem_z_loop:
+	vmovdqu	ymm0, [src]
+	vpor	ymm0, ymm0, [src+32]
+	vmovdqu	ymm1, [src+64]
+	vpor	ymm1, ymm1, [src+96]
+	add	src, 128
+	xor	DWORD(tmp1), DWORD(tmp1)
+	sub	len, 1
+	setz	BYTE(tmp1)
+	vpor	ymm0, ymm0, ymm1
+	vpcmpeqb	ymm0, ymm2, ymm0
+	vpmovmskb	DWORD(tmp0), ymm0
+	not	DWORD(tmp0)
+	add	DWORD(tmp1), DWORD(tmp0)
+	jz	.mem_z_loop
+
+.return:
+	xor	eax, eax
+	test	tmp0, tmp0
+	setnz	al
+	FUNC_RESTORE
+	ret
+
+
+align 16
+.mem_z_small_block:
+	;len < 128
+	xor	DWORD(tmp0), DWORD(tmp0)
+	movzx	DWORD(tmp1), BYTE(len)
+	cmp	DWORD(len), 16
+	jb	.mem_z_small_check_zero
+	;17 < len < 128
+	shr	DWORD(len), 4
+	xor	eax, eax	; alignment
+.mem_z_small_block_loop:
+	xor	eax, eax
+	mov	tmp0, [src]
+	or	tmp0, [src+8]
+	sub	DWORD(len), 1
+	setz	al
+	add	src, 16
+	add	rax, tmp0
+	jz	.mem_z_small_block_loop
+
+	test	tmp0, tmp0
+	jnz	.return_small
+	movzx	DWORD(len), BYTE(tmp1)
+
+.mem_z_small_check_zero:
+	xor	DWORD(tmp0), DWORD(tmp0)
+	and	DWORD(len), 15
+	jz	.return_small
+.mem_z_small_byte_loop:
+	movzx	eax, byte [src]
+	add	src, 1
+	or	DWORD(tmp0), eax
+	sub	DWORD(len), 1
+	jnz	.mem_z_small_byte_loop
+.return_small:
+	xor	eax, eax
+	test	tmp0, tmp0
+	setnz	al
+	ret
+
+endproc_frame
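
Note on the rewritten AVX routine (an illustrative sketch, not part of the patch): the new mem_zero_detect_avx scans four 32-byte blocks per iteration with vptest, which sets ZF only when its operand is all zero, and then re-checks the final 4*32 bytes with overlapping unaligned loads instead of a scalar tail loop. The C model below mirrors that control flow with AVX intrinsics; the helper name zero_detect_ref is hypothetical, and the return convention (0 for an all-zero buffer, non-zero otherwise) is assumed to match isal_zero_detect. Compile with AVX enabled, e.g. gcc -O2 -mavx.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical reference model of the vptest strategy used above. */
static int zero_detect_ref(const void *mem, size_t len)
{
	const uint8_t *src = mem;
	size_t pos = 0;

	if (len >= 4 * 32) {
		/* .mem_z_loop: four 32-byte loads, vptest each one */
		while (pos + 4 * 32 <= len) {
			for (int i = 0; i < 4; i++) {
				__m256i v = _mm256_loadu_si256(
				    (const __m256i *)(src + pos + 32 * i));
				if (!_mm256_testz_si256(v, v)) /* ZF clear */
					return 1;
			}
			pos += 4 * 32;
		}
		/* .mem_z_last_block: reload the last 128 bytes; overlapping
		 * bytes already checked is harmless for a zero test and
		 * avoids a byte-granular tail loop. */
		for (int i = 0; i < 4; i++) {
			__m256i v = _mm256_loadu_si256(
			    (const __m256i *)(src + len - 4 * 32 + 32 * i));
			if (!_mm256_testz_si256(v, v))
				return 1;
		}
		return 0;
	}
	/* Short buffers: the asm narrows through .mem_z_lt64 / .mem_z_lt32 /
	 * .mem_z_lt16 / .mem_z_lt8 with the same overlapping-load idea;
	 * plain byte checks suffice for a reference model. */
	for (; pos < len; pos++)
		if (src[pos])
			return 1;
	return 0;
}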
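On dispatch (also not part of the patch): mbin_dispatch_init5 in mem_multibinary.asm takes the interface symbol plus the base, SSE, AVX, and AVX2 implementations; before this change the AVX2 slot reused mem_zero_detect_avx, and the one-line edit points it at the new mem_zero_detect_avx2, so AVX2-capable CPUs pick up the denser kernel while callers keep using the single isal_zero_detect entry point. A minimal caller, assuming the prototype int isal_zero_detect(void *mem, size_t len) from isa-l's mem_routines.h:

#include <stdio.h>
#include <string.h>
#include "mem_routines.h"	/* assumed isa-l header providing isal_zero_detect() */

int main(void)
{
	unsigned char buf[4096];

	memset(buf, 0, sizeof(buf));
	printf("all zero: %d\n", isal_zero_detect(buf, sizeof(buf)));	/* expect 0 */

	buf[sizeof(buf) - 1] = 1;	/* dirty the last byte */
	printf("non-zero: %d\n", isal_zero_detect(buf, sizeof(buf)));	/* expect non-zero */
	return 0;
}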