igzip: Add sse optimized adler32 checksum

Change-Id: Id07727b8a8da4b41aa983b487ca881552d5190ee Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
2024-12-12 17:33:50 +01:00 · 2017-05-25 13:51:25 -07:00 · 2017-05-25 13:51:25 -07:00 · e1f5284ff8
commit e1f5284ff8
parent 3025e83b91
4 changed files with 253 additions and 2 deletions
--- a/igzip/Makefile.am
+++ b/igzip/Makefile.am
@ -49,6 +49,7 @@ lsrc_x86_64 +=  \
 		igzip/igzip_icf_finish.asm \
 		igzip/rfc1951_lookup.asm \
 		igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \
 		igzip/adler32_sse.asm \
 		igzip/adler32_avx2_4.asm \
 		igzip/igzip_multibinary.asm \
 		igzip/igzip_update_histogram_01.asm \
--- a/igzip/adler32_sse.asm
+++ b/igzip/adler32_sse.asm
@ -0,0 +1,249 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
 ;  are met:
 ;    * Redistributions of source code must retain the above copyright
 ;      notice, this list of conditions and the following disclaimer.
 ;    * Redistributions in binary form must reproduce the above copyright
 ;      notice, this list of conditions and the following disclaimer in
 ;      the documentation and/or other materials provided with the
 ;      distribution.
 ;    * Neither the name of Intel Corporation nor the names of its
 ;      contributors may be used to endorse or promote products derived
 ;      from this software without specific prior written permission.
 ;
 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
 ; LIMIT: largest number of bytes accumulated between two modulo reductions.
 %define LIMIT 5552
 ; BASE: modulus applied to both running sums.
 %define BASE  0xFFF1 ; 65521
 %include "reg_sizes.asm"
 default rel
 [bits 64]
 ; eax, ecx and edx must stay free: the div-based modulo reductions use
 ; edx:eax as dividend, ecx as divisor and edx as remainder.
 ; Register assignment used when assembling for elf64.
 %ifidn __OUTPUT_FORMAT__, elf64
 ; incoming arguments: arg1 = init, arg2 = buf, arg3 = len
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx
 ; init_d: 32-bit view of arg1
 %define init_d edi
 ; data: current read pointer, size: bytes still to process,
 ; s: length of the block being processed, end: loop end pointer
 %define data   r9
 %define size   r10
 %define s      r11
 ; a_d / b_d: scalar copies of the two running sums
 %define a_d    r12d
 %define b_d    r8d
 %define end    r13
 %define func(x) x:
 ; r12 and r13 are callee-saved, so they are pushed on entry and popped
 ; in reverse order before returning.
 %macro FUNC_SAVE 0
 	push	r12
 	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
 	pop	r13
 	pop	r12
 %endmacro
 %endif
 ; Register assignment used when assembling for win64.
 %ifidn __OUTPUT_FORMAT__, win64
 ; incoming arguments: arg1 = init, arg2 = buf, arg3 = len
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8
 ; init_d lives in r12d here; FUNC_SAVE copies it out of ecx
 %define init_d r12d
 ; data: current read pointer, size: bytes still to process,
 ; s: length of the block being processed, end: loop end pointer
 %define data   r9
 %define size	r10
 %define s	r11
 ; a_d / b_d: scalar copies of the two running sums
 %define a_d	esi
 %define b_d	edi
 %define end	r13
 ; Four of the five 8-byte slots hold saved registers; the fifth is unused.
 ; NOTE(review): the odd multiple presumably keeps rsp 16-byte aligned
 ; after the allocation - confirm.
 %define stack_size  5*8		; must be an odd multiple of 8
 %define func(x) proc_frame x
 ; Saves rdi, rsi, r12 and r13 (all overwritten by the register map above)
 ; to the stack slots.  alloc_stack, save_reg and end_prolog are not
 ; defined in this file; presumably they come from reg_sizes.asm.
 %macro FUNC_SAVE 0
 	alloc_stack	stack_size
 	save_reg	rdi,  0*8
 	save_reg	rsi,  1*8
 	save_reg	r12,  2*8
 	save_reg	r13,  3*8
 	end_prolog
 	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
 %endmacro
 ; Reloads the four saved registers from their slots and releases the frame.
 %macro FUNC_RESTORE 0
 	mov	rdi,  [rsp + 0*8]
 	mov	rsi,  [rsp + 1*8]
 	mov	r12,  [rsp + 2*8]
 	mov	r13,  [rsp + 3*8]
 	add	rsp, stack_size
 %endmacro
 %endif
 ; xa:     four dword lanes of the running sum a
 ; xb:     four dword lanes of the running sum b
 ; xdata0: input bytes 0..3 of a group, zero-extended to dwords
 ; xdata1: input bytes 4..7 of a group, zero-extended to dwords
 ; xsa:    copy of xa multiplied lane-wise by A_SCALE during reduction
 %define xa	xmm0
 %define xb	xmm1
 %define xdata0	xmm2
 %define xdata1	xmm3
 %define xsa	xmm4
 ;-----------------------------------------------------------------------
 ; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
 ;
 ; Continues a checksum from init over len bytes at buf.  The low 16 bits
 ; of init seed the running sum a and the high 16 bits seed b; for every
 ; input byte a += byte and b += a.  Both sums are reduced modulo BASE
 ; and the result is returned in eax as (b << 16) | a.
 ;
 ; In:    arg1 = init, arg2 = buf, arg3 = len
 ; Out:   eax  = updated checksum
 ; Uses:  eax, ecx, edx, init_d, data, size, s, end, a_d, b_d,
 ;        xa, xb, xdata0, xdata1, xsa, flags
 ;
 ; Inputs shorter than 32 bytes go straight to the scalar byte loop.
 ; Longer inputs are consumed in blocks of at most LIMIT bytes, 8 bytes
 ; per iteration, with a and b kept as four interleaved dword lanes.
 ;-----------------------------------------------------------------------
 global adler32_sse:function
 func(adler32_sse)
 	FUNC_SAVE
 	mov	data, arg2
 	mov	size, arg3
 	; split init into its 16-bit halves: b goes to b_d, a stays in init_d
 	mov	b_d, init_d
 	shr	b_d, 16
 	and	init_d, 0xFFFF
 	; inputs below 32 bytes skip the vector path (the label says 64, but
 	; the threshold compared against is 32)
 	cmp	size, 32
 	jb	.lt64
 	; lane 0 of xa starts at a; the other lanes and all of xb start at 0
 	movd	xa, init_d
 	pxor	xb, xb
 	; outer loop: one pass per block of s bytes
 .sloop1:
 	mov	s, LIMIT
 	cmp	s, size
 	cmova	s, size		; s = min(size, LIMIT)
 	; end = data + s - 7, so the inner loop runs only while a complete
 	; group of 8 bytes is left in this block
 	lea	end, [data + s - 7]
 	cmp	data, end
 	jae	.skip_loop_1a
 align 32
 .sloop1a:
 	; consume 8 bytes as two groups of 4, one byte per dword lane;
 	; after each group xa gains the bytes and xb gains the new xa
 	pmovzxbd xdata0, [data]
 	pmovzxbd xdata1, [data + 4]
 	add	data, 8
 	paddd	xa, xdata0
 	paddd	xb, xa
 	paddd	xa, xdata1
 	paddd	xb, xa
 	cmp	data, end
 	jb	.sloop1a
 .skip_loop_1a:
 	add	end, 7		; end = first byte past this block
 	; s not a multiple of 8: 1..7 tail bytes remain.  LIMIT is a multiple
 	; of 8, so this only happens when s == size, i.e. on the last block.
 	test	s, 7
 	jnz	.do_final
 	; either we're done, or we just did LIMIT
 	sub	size, s
 	; reduce: fold the four lanes back into scalar sums.  Each vector step
 	; covers 4 byte positions, so xb is scaled by 4; that over-counts the
 	; bytes of lane j by j per byte, i.e. by j * xa[j] in total, which is
 	; removed by subtracting xa * A_SCALE:
 	;   b = 4 * sum(xb[j]) - sum(j * xa[j])
 	pslld	xb, 2   ; b is scaled by 4
 	movdqa	xsa, xa ; scaled a
 	pmulld	xsa, [A_SCALE]
 	; two horizontal adds leave the sum of all four lanes in lane 0
 	phaddd	xa, xa
 	phaddd	xb, xb
 	phaddd	xsa, xsa
 	phaddd	xa, xa
 	phaddd	xb, xb
 	phaddd	xsa, xsa
 	; a_d = a mod BASE
 	movd	eax, xa
 	xor	edx, edx
 	mov	ecx, BASE
 	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
 	mov	a_d, edx
 	; b_d = (b carried in from earlier blocks + b of this block) mod BASE
 	psubd	xb, xsa
 	movd	eax, xb
 	add	eax, b_d
 	xor	edx, edx
 	mov	ecx, BASE
 	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
 	mov	b_d, edx
 	test	size, size
 	jz	.finish
 	; continue loop: reseed the lanes from the reduced scalars
 	movd	xa, a_d
 	pxor	xb, xb
 	jmp	.sloop1
 	; a_d and b_d are already reduced here; pack the result
 .finish:
 	mov	eax, b_d
 	shl	eax, 16
 	or	eax, a_d
 	jmp	.end
 	; short input (size < 32): scalar loop only, or straight to the final
 	; reduction when size is 0
 .lt64:
 	mov	a_d, init_d
 	lea	end, [data + size]
 	test	size, size
 	jnz	.final_loop
 	jmp	.zero_size
 	; handle the remaining s mod 8 = 1..7 tail bytes of the last block
 .do_final:
 	; reduce: same lane fold as above, but without the modulo; that is
 	; left to the final reduction after the byte loop
 	pslld	xb, 2   ; b is scaled by 4
 	movdqa	xsa, xa ; scaled a
 	pmulld	xsa, [A_SCALE]
 	phaddd	xa, xa
 	phaddd	xb, xb
 	phaddd	xsa, xsa
 	phaddd	xa, xa
 	phaddd	xb, xb
 	phaddd	xsa, xsa
 	psubd	xb, xsa
 	movd	a_d, xa
 	movd	eax, xb
 	add	b_d, eax
 align 32
 	; scalar loop: one byte per iteration until data reaches end
 .final_loop:
 	movzx	eax, byte[data]
 	add	a_d, eax
 	inc	data
 	add	b_d, a_d
 	cmp	data, end
 	jb	.final_loop
 	; final reduction: eax = ((b_d mod BASE) << 16) | (a_d mod BASE)
 .zero_size:
 	mov	eax, a_d
 	xor	edx, edx
 	mov	ecx, BASE
 	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
 	mov	a_d, edx
 	mov	eax, b_d
 	xor	edx, edx
 	mov	ecx, BASE
 	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
 	shl	edx, 16
 	or	edx, a_d
 	mov	eax, edx
 .end:
 	FUNC_RESTORE
 	ret
 endproc_frame
 section .data
 align 32
 ; Per-lane multipliers for xa during reduction.  Read as four
 ; little-endian dwords the table is {0, 1, 2, 3}: lane j is multiplied
 ; by j.
 A_SCALE:
 	dq	0x0000000100000000, 0x0000000300000002
--- a/igzip/encode_df_04.asm
+++ b/igzip/encode_df_04.asm
@ -277,7 +277,7 @@ encode_deflate_icf_ %+ ARCH:
 	;; Check for short codes
 	vptest code_lens2, [min_write_mask]
 	jz	.short_codes
-.short_codes_next
+.short_codes_next:
 	vpermq	codes2, codes2, 0x45
 	vpor	codes1, codes1, codes2
--- a/igzip/igzip_multibinary.asm
+++ b/igzip/igzip_multibinary.asm
@ -69,6 +69,7 @@ extern crc32_gzip_01
 extern adler32_base
 extern adler32_avx2_4
 extern adler32_sse
 section .text
@ -99,4 +100,4 @@ mbin_interface		crc32_gzip
 mbin_dispatch_init5	crc32_gzip, crc32_gzip_base, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01
 mbin_interface		isal_adler32
-mbin_dispatch_init5	isal_adler32, adler32_base, adler32_base, adler32_base, adler32_avx2_4
+mbin_dispatch_init5	isal_adler32, adler32_base, adler32_sse, adler32_sse, adler32_avx2_4