From e1f5284ff8dfe7c1c366e6c2ed712b71e4700634 Mon Sep 17 00:00:00 2001
From: Greg Tucker
Date: Thu, 25 May 2017 13:51:25 -0700
Subject: [PATCH] igzip: Add sse optimized adler32 checksum

Change-Id: Id07727b8a8da4b41aa983b487ca881552d5190ee
Signed-off-by: Greg Tucker
---
 igzip/Makefile.am           |   1 +
 igzip/adler32_sse.asm       | 249 ++++++++++++++++++++++++++++++++++++
 igzip/encode_df_04.asm      |   2 +-
 igzip/igzip_multibinary.asm |   3 +-
 4 files changed, 253 insertions(+), 2 deletions(-)
 create mode 100644 igzip/adler32_sse.asm

diff --git a/igzip/Makefile.am b/igzip/Makefile.am
index 90490a2..7263098 100644
--- a/igzip/Makefile.am
+++ b/igzip/Makefile.am
@@ -49,6 +49,7 @@ lsrc_x86_64 += \
 	igzip/igzip_icf_finish.asm \
 	igzip/rfc1951_lookup.asm \
 	igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \
+	igzip/adler32_sse.asm \
 	igzip/adler32_avx2_4.asm \
 	igzip/igzip_multibinary.asm \
 	igzip/igzip_update_histogram_01.asm \
diff --git a/igzip/adler32_sse.asm b/igzip/adler32_sse.asm
new file mode 100644
index 0000000..92c5327
--- /dev/null
+++ b/igzip/adler32_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
+
+%define LIMIT 5552
+%define BASE  0xFFF1 ; 65521
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+; need to keep free: eax, ecx, edx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1   rdi
+ %define arg2   rsi
+ %define arg3   rdx
+
+ %define init_d edi
+ %define data   r9
+ %define size   r10
+ %define s      r11
+ %define a_d    r12d
+ %define b_d    r8d
+ %define end    r13
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r13
+	pop	r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1   rcx
+ %define arg2   rdx
+ %define arg3   r8
+
+ %define init_d r12d
+ %define data   r9
+ %define size   r10
+ %define s      r11
+ %define a_d    esi
+ %define b_d    edi
+ %define end    r13
+
+ %define stack_size 5*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_reg	rdi, 0*8
+	save_reg	rsi, 1*8
+	save_reg	r12, 2*8
+	save_reg	r13, 3*8
+	end_prolog
+	mov	init_d, ecx ; initialize init_d from arg1 to keep ecx free
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	mov	rdi, [rsp + 0*8]
+	mov	rsi, [rsp + 1*8]
+	mov	r12, [rsp + 2*8]
+	mov	r13, [rsp + 3*8]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+%define xa     xmm0
+%define xb     xmm1
+%define xdata0 xmm2
+%define xdata1 xmm3
+%define xsa    xmm4
+
+global adler32_sse:function
+func(adler32_sse)
+	FUNC_SAVE
+
+	mov	data, arg2
+	mov	size, arg3
+
+	mov	b_d, init_d
+	shr	b_d, 16
+	and	init_d, 0xFFFF
+	cmp	size, 32
+	jb	.lt32
+	movd	xa, init_d
+	pxor	xb, xb
+.sloop1:
+	mov	s, LIMIT
+	cmp	s, size
+	cmova	s, size ; s = min(size, LIMIT)
+	lea	end, [data + s - 7]
+	cmp	data, end
+	jae	.skip_loop_1a
+align 32
+.sloop1a:
+	; do 8 adds
+	pmovzxbd	xdata0, [data]
+	pmovzxbd	xdata1, [data + 4]
+	add	data, 8
+	paddd	xa, xdata0
+	paddd	xb, xa
+	paddd	xa, xdata1
+	paddd	xb, xa
+	cmp	data, end
+	jb	.sloop1a
+
+.skip_loop_1a:
+	add	end, 7
+
+	test	s, 7
+	jnz	.do_final
+
+	; either we're done, or we just did LIMIT
+	sub	size, s
+
+	; reduce
+	pslld	xb, 2 ; b is scaled by 4
+	movdqa	xsa, xa ; scaled a
+	pmulld	xsa, [A_SCALE]
+
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+
+	movd	eax, xa
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	a_d, edx
+
+	psubd	xb, xsa
+	movd	eax, xb
+	add	eax, b_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	b_d, edx
+
+	test	size, size
+	jz	.finish
+
+	; continue loop
+	movd	xa, a_d
+	pxor	xb, xb
+	jmp	.sloop1
+
+.finish:
+	mov	eax, b_d
+	shl	eax, 16
+	or	eax, a_d
+	jmp	.end
+
+.lt32:
+	mov	a_d, init_d
+	lea	end, [data + size]
+	test	size, size
+	jnz	.final_loop
+	jmp	.zero_size
+
+	; handle remaining 1...7 bytes
+.do_final:
+	; reduce
+	pslld	xb, 2 ; b is scaled by 4
+	movdqa	xsa, xa ; scaled a
+	pmulld	xsa, [A_SCALE]
+
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+	phaddd	xa, xa
+	phaddd	xb, xb
+	phaddd	xsa, xsa
+	psubd	xb, xsa
+
+	movd	a_d, xa
+	movd	eax, xb
+	add	b_d, eax
+
+align 32
+.final_loop:
+	movzx	eax, byte [data]
+	add	a_d, eax
+	inc	data
+	add	b_d, a_d
+	cmp	data, end
+	jb	.final_loop
+
+.zero_size:
+	mov	eax, a_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	mov	a_d, edx
+
+	mov	eax, b_d
+	xor	edx, edx
+	mov	ecx, BASE
+	div	ecx ; divide edx:eax by ecx, quot->eax, rem->edx
+	shl	edx, 16
+	or	edx, a_d
+	mov	eax, edx
+
+.end:
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+align 32
+A_SCALE: ; dword lanes 0,1,2,3, used to correct the per-lane b overcount
+	dq	0x0000000100000000, 0x0000000300000002
diff --git a/igzip/encode_df_04.asm b/igzip/encode_df_04.asm
index 2e60867..44b4755 100644
--- a/igzip/encode_df_04.asm
+++ b/igzip/encode_df_04.asm
@@ -277,7 +277,7 @@ encode_deflate_icf_ %+ ARCH:
 	;; Check for short codes
 	vptest	code_lens2, [min_write_mask]
 	jz	.short_codes
-.short_codes_next
+.short_codes_next:
 	vpermq	codes2, codes2, 0x45
 	vpor	codes1, codes1, codes2
 
diff --git a/igzip/igzip_multibinary.asm b/igzip/igzip_multibinary.asm
index 77d984a..516be0e 100644
--- a/igzip/igzip_multibinary.asm
+++ b/igzip/igzip_multibinary.asm
@@ -69,6 +69,7 @@ extern crc32_gzip_01
 
 extern adler32_base
 extern adler32_avx2_4
+extern adler32_sse
 
 section .text
 
@@ -99,4 +100,4 @@ mbin_interface crc32_gzip
 mbin_dispatch_init5	crc32_gzip, crc32_gzip_base, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01
 
 mbin_interface isal_adler32
-mbin_dispatch_init5	isal_adler32, adler32_base, adler32_base, adler32_base, adler32_avx2_4
+mbin_dispatch_init5	isal_adler32, adler32_base, adler32_sse, adler32_sse, adler32_avx2_4
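
For reference, the checksum the new assembly computes can be modeled in C as
below. This is an illustrative sketch, not part of the patch: adler32_ref and
the ADLER_* macro names are hypothetical, and the public entry point remains
isal_adler32, which the multibinary dispatcher now routes to adler32_sse where
supported (the routine itself uses SSE4.1 instructions such as pmovzxbd and
pmulld). The sketch shows the property the assembly relies on: the running
sums a and b stay within 32 bits for up to LIMIT = 5552 bytes, so the modulo
by BASE = 65521 can be deferred to once per block.

  #include <stdint.h>

  #define ADLER_BASE  65521u  /* 0xFFF1, largest prime below 2^16 */
  #define ADLER_LIMIT 5552u   /* max bytes before the 32-bit sums could overflow */

  /* Hypothetical scalar reference model of the checksum computed by
   * adler32_sse. Call with init = 1 for a fresh checksum, as in zlib. */
  static uint32_t adler32_ref(uint32_t init, const unsigned char *buf,
                              uint64_t len)
  {
          uint32_t a = init & 0xFFFF; /* running sum of bytes */
          uint32_t b = init >> 16;    /* running sum of the a values */

          while (len) {
                  uint64_t block = len < ADLER_LIMIT ? len : ADLER_LIMIT;
                  len -= block;
                  while (block--) {   /* modulo deferred within the block */
                          a += *buf++;
                          b += a;
                  }
                  a %= ADLER_BASE;
                  b %= ADLER_BASE;
          }
          return (b << 16) | a;       /* same packing as the .finish path */
  }

The vector path differs only in bookkeeping: xa splits a across four dword
lanes (lane i accumulates the bytes at offsets congruent to i mod 4), and xb
is bumped once per four bytes instead of once per byte. Before each reduction
the code therefore scales b by 4 (pslld xb, 2) and subtracts the per-lane
overcount i * a_i, computed by multiplying xa with the A_SCALE constant whose
dword lanes are [0, 1, 2, 3].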