From e1f5284ff8dfe7c1c366e6c2ed712b71e4700634 Mon Sep 17 00:00:00 2001
From: Greg Tucker
Date: Thu, 25 May 2017 13:51:25 -0700
Subject: [PATCH] igzip: Add sse optimized adler32 checksum

Change-Id: Id07727b8a8da4b41aa983b487ca881552d5190ee
Signed-off-by: Greg Tucker
---
 igzip/Makefile.am           |   1 +
 igzip/adler32_sse.asm       | 249 ++++++++++++++++++++++++++++++++++++
 igzip/encode_df_04.asm      |   2 +-
 igzip/igzip_multibinary.asm |   3 +-
 4 files changed, 253 insertions(+), 2 deletions(-)
 create mode 100644 igzip/adler32_sse.asm

diff --git a/igzip/Makefile.am b/igzip/Makefile.am
index 90490a2..7263098 100644
--- a/igzip/Makefile.am
+++ b/igzip/Makefile.am
@@ -49,6 +49,7 @@ lsrc_x86_64 += \
 	igzip/igzip_icf_finish.asm \
 	igzip/rfc1951_lookup.asm \
 	igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \
+	igzip/adler32_sse.asm \
 	igzip/adler32_avx2_4.asm \
 	igzip/igzip_multibinary.asm \
 	igzip/igzip_update_histogram_01.asm \
diff --git a/igzip/adler32_sse.asm b/igzip/adler32_sse.asm
new file mode 100644
index 0000000..92c5327
--- /dev/null
+++ b/igzip/adler32_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
+
+%define LIMIT 5552
+%define BASE  0xFFF1 ; 65521
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+; need to keep free: eax, ecx, edx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1   rdi
+ %define arg2   rsi
+ %define arg3   rdx
+
+ %define init_d edi
+ %define data   r9
+ %define size   r10
+ %define s      r11
+ %define a_d    r12d
+ %define b_d    r8d
+ %define end    r13
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r13
+	pop	r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1   rcx
+ %define arg2   rdx
+ %define arg3   r8
+
+ %define init_d r12d
+ %define data   r9
+ %define size   r10
+ %define s      r11
+ %define a_d    esi
+ %define b_d    edi
+ %define end    r13
+
+ %define stack_size 5*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_reg	rdi, 0*8
+	save_reg	rsi, 1*8
+	save_reg	r12, 2*8
+	save_reg	r13, 3*8
+	end_prolog
+	mov	init_d, ecx ; initialize init_d from arg1 to keep ecx free
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	mov	rdi, [rsp + 0*8]
+	mov	rsi, [rsp + 1*8]
+	mov	r12, [rsp + 2*8]
+	mov	r13, [rsp + 3*8]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+%define xa     xmm0
+%define xb     xmm1
+%define xdata0 xmm2
+%define xdata1 xmm3
+%define xsa    xmm4
+
+global adler32_sse:function
+func(adler32_sse)
+	FUNC_SAVE
+
+	mov	data, arg2
+	mov	size, arg3
+
+	mov	b_d, init_d
+	shr	b_d, 16
+	and	init_d, 0xFFFF
+	cmp	size, 32
+	jb	.lt32
+	movd	xa, init_d
+	pxor	xb, xb
+.sloop1:
+	mov	s, LIMIT
+	cmp	s, size
+	cmova	s, size ; s = min(size, LIMIT)
+	lea	end, [data + s - 7]
+	cmp	data, end
+	jae	.skip_loop_1a
+align 32
+.sloop1a:
+	; do 8 adds
+	pmovzxbd	xdata0, [data]
+	pmovzxbd	xdata1, [data + 4]
+	add	data, 8
+	paddd	xa, xdata0
+	paddd	xb, xa
+	paddd	xa, xdata1
+	paddd	xb, xa
+	cmp	data, end
+	jb	.sloop1a
+
+.skip_loop_1a:
+	add	end, 7
+
+	test	s, 7
+	jnz	.do_final
+
+	; either we're done, or we just did LIMIT
+	sub	size, s
+
+	; reduce
+	pslld	xb, 2 ; b is scaled by 4
+	movdqa	xsa, xa ; scaled a
+	pmulld	xsa, [A_SCALE]
+
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+
+	movd	eax, xa
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	a_d, edx
+
+	psubd	xb, xsa
+	movd	eax, xb
+	add	eax, b_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	b_d, edx
+
+	test	size, size
+	jz	.finish
+
+	; continue loop
+	movd	xa, a_d
+	pxor	xb, xb
+	jmp	.sloop1
+
+.finish:
+	mov	eax, b_d
+	shl	eax, 16
+	or	eax, a_d
+	jmp	.end
+
+.lt32:
+	mov	a_d, init_d
+	lea	end, [data + size]
+	test	size, size
+	jnz	.final_loop
+	jmp	.zero_size
+
+	; handle remaining 1...7 bytes
+.do_final:
+	; reduce
+	pslld	xb, 2 ; b is scaled by 4
+	movdqa	xsa, xa ; scaled a
+	pmulld	xsa, [A_SCALE]
+
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+	psubd	xb, xsa
+
+	movd	a_d, xa
+	movd	eax, xb
+	add	b_d, eax
+
+align 32
+.final_loop:
+	movzx	eax, byte [data]
+	add	a_d, eax
+	inc	data
+	add	b_d, a_d
+	cmp	data, end
+	jb	.final_loop
+
+.zero_size:
+	mov	eax, a_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	a_d, edx
+
+	mov	eax, b_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	shl	edx, 16
+	or	edx, a_d
+	mov	eax, edx
+
+.end:
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+align 32
+A_SCALE: ; dword lanes 0,1,2,3, used to correct the per-lane b overcount
+	dq	0x0000000100000000, 0x0000000300000002
diff --git a/igzip/encode_df_04.asm b/igzip/encode_df_04.asm
index 2e60867..44b4755 100644
--- a/igzip/encode_df_04.asm
+++ b/igzip/encode_df_04.asm
@@ -277,7 +277,7 @@ encode_deflate_icf_ %+ ARCH:
 	;; Check for short codes
 	vptest	code_lens2, [min_write_mask]
 	jz	.short_codes
-.short_codes_next
+.short_codes_next:
 	vpermq	codes2, codes2, 0x45
 	vpor	codes1, codes1, codes2
 
diff --git a/igzip/igzip_multibinary.asm b/igzip/igzip_multibinary.asm
index 77d984a..516be0e 100644
--- a/igzip/igzip_multibinary.asm
+++ b/igzip/igzip_multibinary.asm
@@ -69,6 +69,7 @@ extern crc32_gzip_01
 
 extern adler32_base
 extern adler32_avx2_4
+extern adler32_sse
 
 section .text
 
@@ -99,4 +100,4 @@ mbin_interface crc32_gzip
 mbin_dispatch_init5	crc32_gzip, crc32_gzip_base, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01
 
 mbin_interface isal_adler32
-mbin_dispatch_init5	isal_adler32, adler32_base, adler32_base, adler32_base, adler32_avx2_4
+mbin_dispatch_init5	isal_adler32, adler32_base, adler32_sse, adler32_sse, adler32_avx2_4
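
For reference, the checksum the new assembly computes can be modeled in C as
below. This is an illustrative sketch, not part of the patch: adler32_ref and
the ADLER_* macro names are hypothetical, and the public entry point remains
isal_adler32, which the multibinary dispatcher now routes to adler32_sse where
supported (the routine itself uses SSE4.1 instructions such as pmovzxbd and
pmulld). The sketch shows the property the assembly relies on: the running
sums a and b stay within 32 bits for up to LIMIT = 5552 bytes, so the modulo
by BASE = 65521 can be deferred to once per block.

  #include <stdint.h>

  #define ADLER_BASE  65521u  /* 0xFFF1, largest prime below 2^16 */
  #define ADLER_LIMIT 5552u   /* max bytes before the 32-bit sums could overflow */

  /* Hypothetical scalar reference model of the checksum computed by
   * adler32_sse. Call with init = 1 for a fresh checksum, as in zlib. */
  static uint32_t adler32_ref(uint32_t init, const unsigned char *buf,
                              uint64_t len)
  {
          uint32_t a = init & 0xFFFF; /* running sum of bytes */
          uint32_t b = init >> 16;    /* running sum of the a values */

          while (len) {
                  uint64_t block = len < ADLER_LIMIT ? len : ADLER_LIMIT;
                  len -= block;
                  while (block--) {   /* modulo deferred within the block */
                          a += *buf++;
                          b += a;
                  }
                  a %= ADLER_BASE;
                  b %= ADLER_BASE;
          }
          return (b << 16) | a;       /* same packing as the .finish path */
  }

The vector path differs only in bookkeeping: xa splits a across four dword
lanes (lane i accumulates the bytes at offsets congruent to i mod 4), and xb
is bumped once per four bytes instead of once per byte. Before each reduction
the code therefore scales b by 4 (pslld xb, 2) and subtracts the per-lane
overcount i * a_i, computed by multiplying xa with the A_SCALE constant whose
dword lanes are [0, 1, 2, 3].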