isa-l/igzip/adler32_sse.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)

; Largest number of input bytes adler32_sse accumulates before it reduces
; both sums modulo BASE (used as the per-chunk cap in the main loop).
%define LIMIT 5552
; Modulus applied to both 16-bit halves of the checksum.
%define BASE  65521 ; 0xFFF1

%include "reg_sizes.asm"

default rel
[bits 64]

; need to keep free: eax, ecx, edx

; Register assignment and prologue/epilogue used when assembling for elf64.
%ifidn __OUTPUT_FORMAT__, elf64
 ; Incoming arguments: init, buf, len.
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx

 ; Working registers of adler32_sse.
 ;   init_d : 32-bit view of arg1 (the incoming checksum)
 ;   data   : current read pointer into the buffer
 ;   size   : bytes still to be processed
 ;   s      : length of the current chunk, min(size, LIMIT)
 ;   a_d    : scalar "a" sum
 ;   b_d    : scalar "b" sum
 ;   end    : end pointer for the current loop
 ; None of these alias eax/ecx/edx, which the div sequences need.
 %define init_d edi
 %define data   r9
 %define size   r10
 %define s      r11
 %define a_d    r12d
 %define b_d    r8d
 %define end    r13

 ; Function entry: plain label followed by endbranch.
 %define func(x) x: endbranch
 ; r12 (a_d) and r13 (end) are the only callee-saved registers the
 ; function writes, so they are the only ones pushed here.
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 ; Pops in reverse order of FUNC_SAVE.
%macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif


; Register assignment and prologue/epilogue used when assembling for win64.
%ifidn __OUTPUT_FORMAT__, win64
 ; Incoming arguments: init, buf, len.
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8

 ; Working registers of adler32_sse.
 ;   init_d : copy of the incoming checksum (arg1 lives in ecx, which
 ;            must stay free for div, so FUNC_SAVE copies it to r12d)
 ;   data   : current read pointer into the buffer
 ;   size   : bytes still to be processed
 ;   s      : length of the current chunk, min(size, LIMIT)
 ;   a_d    : scalar "a" sum
 ;   b_d    : scalar "b" sum
 ;   end    : end pointer for the current loop
 %define init_d r12d
 %define data   r9
 %define size	r10
 %define s	r11
 %define a_d	esi
 %define b_d	edi
 %define end	r13

 ; Five 8-byte slots are reserved; FUNC_SAVE only writes slots 0..3.
 ; NOTE(review): the odd count presumably keeps rsp 16-byte aligned after
 ; the return address has been pushed - confirm against the win64 ABI.
 %define stack_size  5*8		; must be an odd multiple of 8
 %define func(x) proc_frame x
 ; Reserve the frame and store rdi, rsi, r12 and r13 (all written by the
 ; function body) before copying arg1 into init_d.
 ; alloc_stack / save_reg / end_prolog are not defined in this file.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_reg	rdi,  0*8
	save_reg	rsi,  1*8
	save_reg	r12,  2*8
	save_reg	r13,  3*8
	end_prolog
	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
 %endmacro

 ; Reload the four saved registers from the same slots and release the frame.
 %macro FUNC_RESTORE 0
	mov	rdi,  [rsp + 0*8]
	mov	rsi,  [rsp + 1*8]
	mov	r12,  [rsp + 2*8]
	mov	r13,  [rsp + 3*8]
	add	rsp, stack_size
 %endmacro
%endif

; Vector registers used by adler32_sse (four dword lanes each).
; xa: per-lane running byte sums; lane 0 is seeded with the incoming "a".
%define xa	xmm0
; xb: per-lane running sums of xa, accumulated after every 4-byte step.
%define xb	xmm1
; xdata0 / xdata1: two groups of 4 input bytes, zero-extended to dwords.
%define xdata0	xmm2
%define xdata1	xmm3
; xsa: copy of xa multiplied lane-wise by A_SCALE during the reduction.
%define xsa	xmm4

[bits 64]
default rel
section .text

;-----------------------------------------------------------------------
; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
;
; In:   arg1 = init  running checksum, b in bits 31..16, a in bits 15..0
;       arg2 = buf   input bytes
;       arg3 = len   number of input bytes
; Out:  eax  = (b << 16) | a, with a and b each reduced modulo BASE
;
; Inputs shorter than 32 bytes are summed one byte at a time. Longer
; inputs are consumed in chunks of at most LIMIT bytes: each chunk is
; summed 8 bytes per iteration in four dword lanes, the lanes are folded,
; and a/b are reduced modulo BASE before the next chunk starts.
;
; Scratch: eax, ecx, edx (div), xa, xb, xdata0, xdata1, xsa, flags.
;-----------------------------------------------------------------------

; Fold the four dword lanes of xa and xb into totals.
; On exit every lane of xa holds the sum of the xa lanes, and every lane
; of xb holds 4*sum(xb lanes) - sum(j * xa lane j), which is the change in
; b caused by the bytes the vector loop consumed. Overwrites xsa.
%macro ADLER32_SSE_FOLD_LANES 0
	pslld	xb, 2			; every vector step advances 4 bytes per lane
	movdqa	xsa, xa
	pmulld	xsa, [A_SCALE]		; xsa lane j = j * xa lane j

	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa
	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa
	psubd	xb, xsa			; remove the per-lane position offset
%endmacro

; edx = eax mod BASE. Leaves the quotient in eax and BASE in ecx.
%macro ADLER32_SSE_MOD_BASE 0
	xor	edx, edx
	mov	ecx, BASE
	div	ecx
%endmacro

mk_global adler32_sse, function
func(adler32_sse)
	FUNC_SAVE

	mov	data, arg2
	mov	size, arg3

	; split init: b_d = high half, init_d = low half (a)
	mov	b_d, init_d
	shr	b_d, 16
	and	init_d, 0xFFFF
	cmp	size, 32
	jb	.short_input		; fewer than 32 bytes: scalar path only
	movd	xa, init_d		; lane 0 starts at a, other lanes at 0
	pxor	xb, xb

.next_chunk:
	mov	s, LIMIT
	cmp	s, size
	cmova	s, size			; s = min(size, LIMIT)
	lea	end, [data + s - 7]	; stop once fewer than 8 chunk bytes remain
	cmp	data, end
	jae	.vec_done
align 32
.vec_loop:
	; 8 input bytes per iteration, as two 4-lane steps
	pmovzxbd xdata0, [data]
	pmovzxbd xdata1, [data + 4]
	add	data, 8
	paddd	xa, xdata0
	paddd	xb, xa
	paddd	xa, xdata1
	paddd	xb, xa
	cmp	data, end
	jb	.vec_loop

.vec_done:
	add	end, 7			; undo the -7 bias: end = end of this chunk

	; A chunk length that is not a multiple of 8 leaves 1..7 bytes for the
	; byte loop. LIMIT is a multiple of 8, so that chunk is the last one.
	test	s, 7
	jnz	.tail

	; whole chunk consumed: either the input is finished or LIMIT bytes
	; were just processed
	sub	size, s

	ADLER32_SSE_FOLD_LANES

	movd	eax, xa
	ADLER32_SSE_MOD_BASE
	mov	a_d, edx		; a = (sum of lanes) mod BASE

	movd	eax, xb
	add	eax, b_d
	ADLER32_SSE_MOD_BASE
	mov	b_d, edx		; b = (previous b + chunk contribution) mod BASE

	test	size, size
	jz	.pack_result

	; more input: restart the lane accumulators from the reduced a
	movd	xa, a_d
	pxor	xb, xb
	jmp	.next_chunk

.pack_result:
	; a_d and b_d are already reduced here
	mov	eax, b_d
	shl	eax, 16
	or	eax, a_d
	jmp	.return

.short_input:
	mov	a_d, init_d
	lea	end, [data + size]
	test	size, size
	jz	.reduce_scalar		; empty input: just reduce and repack init
	jmp	.byte_loop

	; last chunk with 1..7 bytes left over after the vector loop
.tail:
	ADLER32_SSE_FOLD_LANES

	movd	a_d, xa			; unreduced a; reduced after the byte loop
	movd	eax, xb
	add	b_d, eax

align 32
.byte_loop:
	movzx	eax, byte[data]
	add	a_d, eax
	inc	data
	add	b_d, a_d
	cmp	data, end
	jb	.byte_loop

.reduce_scalar:
	mov	eax, a_d
	ADLER32_SSE_MOD_BASE
	mov	a_d, edx

	mov	eax, b_d
	ADLER32_SSE_MOD_BASE
	shl	edx, 16			; edx = (b mod BASE) << 16
	or	edx, a_d
	mov	eax, edx

.return:
	FUNC_RESTORE
	ret

endproc_frame

section .data
align 32
; Per-lane multipliers for pmulld in adler32_sse: dword lane j holds j.
A_SCALE:
	dd	0, 1, 2, 3
igzip: Add sse optimized adler32 checksum Change-Id: Id07727b8a8da4b41aa983b487ca881552d5190ee Signed-off-by: Greg Tucker <greg.b.tucker@intel.com> 2017-05-25 22:51:25 +02:00			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`; Copyright(c) 2011-2017 Intel Corporation All rights reserved.`
			`;`
			`; Redistribution and use in source and binary forms, with or without`
			`; modification, are permitted provided that the following conditions`
			`; are met:`
			`; * Redistributions of source code must retain the above copyright`
			`; notice, this list of conditions and the following disclaimer.`
			`; * Redistributions in binary form must reproduce the above copyright`
			`; notice, this list of conditions and the following disclaimer in`
			`; the documentation and/or other materials provided with the`
			`; distribution.`
			`; * Neither the name of Intel Corporation nor the names of its`
			`; contributors may be used to endorse or promote products derived`
			`; from this software without specific prior written permission.`
			`;`
			`; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS`
			`; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT`
			`; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR`
			`; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT`
			`; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,`
			`; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT`
			`; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,`
			`; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY`
			`; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`

			`; uint32_t adler32_avx2(uint32_t init, const unsigned char *buf, uint64_t len)`

			`%define LIMIT 5552`
			`%define BASE 0xFFF1 ; 65521`

			`%include "reg_sizes.asm"`

			`default rel`
			`[bits 64]`

			`; need to keep free: eax, ecx, edx`

			`%ifidn __OUTPUT_FORMAT__, elf64`
			`%define arg1 rdi`
			`%define arg2 rsi`
			`%define arg3 rdx`

			`%define init_d edi`
			`%define data r9`
			`%define size r10`
			`%define s r11`
			`%define a_d r12d`
			`%define b_d r8d`
			`%define end r13`

x86: Add ENDBR32/ENDBR64 at function entries for Intel CET To support Intel CET, all indirect branch targets must start with ENDBR32/ENDBR64. Here is a patch to define endbranch and add it to function entries in x86 assembly codes which are indirect branch targets as discovered by running testsuite on Intel CET machine and visual inspection. Verified with $ CC="gcc -Wl,-z,cet-report=error -fcf-protection" CXX="g++ -Wl,-z,cet-report=error -fcf-protection" .../configure x86_64-linux $ make -j8 $ make -j8 check with both nasm and yasm on both CET and non-CET machines. Change-Id: I9822578e7294fb5043a64ab7de5c41de81a7d337 Signed-off-by: H.J. Lu <hjl.tools@gmail.com> 2020-05-22 19:46:50 +02:00			`%define func(x) x: endbranch`
igzip: Add sse optimized adler32 checksum Change-Id: Id07727b8a8da4b41aa983b487ca881552d5190ee Signed-off-by: Greg Tucker <greg.b.tucker@intel.com> 2017-05-25 22:51:25 +02:00			`%macro FUNC_SAVE 0`
			`push r12`
			`push r13`
			`%endmacro`
			`%macro FUNC_RESTORE 0`
			`pop r13`
			`pop r12`
			`%endmacro`
			`%endif`


			`%ifidn __OUTPUT_FORMAT__, win64`
			`%define arg1 rcx`
			`%define arg2 rdx`
			`%define arg3 r8`

			`%define init_d r12d`
			`%define data r9`
			`%define size r10`
			`%define s r11`
			`%define a_d esi`
			`%define b_d edi`
			`%define end r13`

			`%define stack_size 5*8 ; must be an odd multiple of 8`
			`%define func(x) proc_frame x`
			`%macro FUNC_SAVE 0`
			`alloc_stack stack_size`
			`save_reg rdi, 0*8`
			`save_reg rsi, 1*8`
			`save_reg r12, 2*8`
			`save_reg r13, 3*8`
			`end_prolog`
			`mov init_d, ecx ; initalize init_d from arg1 to keep ecx free`
			`%endmacro`

			`%macro FUNC_RESTORE 0`
			`mov rdi, [rsp + 0*8]`
			`mov rsi, [rsp + 1*8]`
			`mov r12, [rsp + 2*8]`
			`mov r13, [rsp + 3*8]`
			`add rsp, stack_size`
			`%endmacro`
			`%endif`

			`%define xa xmm0`
			`%define xb xmm1`
			`%define xdata0 xmm2`
			`%define xdata1 xmm3`
			`%define xsa xmm4`

build: Fix for windows to allow nasm use Previously windows build could only use yasm because some procedural items such as proc_start were not supported by nasm. This adds a few macros and fixes so nasm can be used to build on windows. Change-Id: Ia05dc3ff482f33b0f915bb1be3c7df5e4a753b3a Signed-off-by: Greg Tucker <greg.b.tucker@intel.com> 2020-03-17 00:23:55 +01:00			`[bits 64]`
			`default rel`
			`section .text`

			`mk_global adler32_sse, function`
igzip: Add sse optimized adler32 checksum Change-Id: Id07727b8a8da4b41aa983b487ca881552d5190ee Signed-off-by: Greg Tucker <greg.b.tucker@intel.com> 2017-05-25 22:51:25 +02:00			`func(adler32_sse)`
			`FUNC_SAVE`

			`mov data, arg2`
			`mov size, arg3`

			`mov b_d, init_d`
			`shr b_d, 16`
			`and init_d, 0xFFFF`
			`cmp size, 32`
			`jb .lt64`
			`movd xa, init_d`
			`pxor xb, xb`
			`.sloop1:`
			`mov s, LIMIT`
			`cmp s, size`
			`cmova s, size ; s = min(size, LIMIT)`
			`lea end, [data + s - 7]`
			`cmp data, end`
			`jae .skip_loop_1a`
			`align 32`
			`.sloop1a:`
			`; do 8 adds`
			`pmovzxbd xdata0, [data]`
			`pmovzxbd xdata1, [data + 4]`
			`add data, 8`
			`paddd xa, xdata0`
			`paddd xb, xa`
			`paddd xa, xdata1`
			`paddd xb, xa`
			`cmp data, end`
			`jb .sloop1a`

			`.skip_loop_1a:`
			`add end, 7`

			`test s, 7`
			`jnz .do_final`

			`; either we're done, or we just did LIMIT`
			`sub size, s`

			`; reduce`
			`pslld xb, 2 ; b is scaled by 4`
			`movdqa xsa, xa ; scaled a`
			`pmulld xsa, [A_SCALE]`

			`phaddd xa, xa`
			`phaddd xb, xb`
			`phaddd xsa, xsa`
			`phaddd xa, xa`
			`phaddd xb, xb`
			`phaddd xsa, xsa`

			`movd eax, xa`
			`xor edx, edx`
			`mov ecx, BASE`
			`div ecx ; divide edx:eax by ecx, quot->eax, rem->edx`
			`mov a_d, edx`

			`psubd xb, xsa`
			`movd eax, xb`
			`add eax, b_d`
			`xor edx, edx`
			`mov ecx, BASE`
			`div ecx ; divide edx:eax by ecx, quot->eax, rem->edx`
			`mov b_d, edx`

			`test size, size`
			`jz .finish`

			`; continue loop`
			`movd xa, a_d`
			`pxor xb, xb`
			`jmp .sloop1`

			`.finish:`
			`mov eax, b_d`
			`shl eax, 16`
			`or eax, a_d`
			`jmp .end`

			`.lt64:`
			`mov a_d, init_d`
			`lea end, [data + size]`
			`test size, size`
			`jnz .final_loop`
			`jmp .zero_size`

			`; handle remaining 1...15 bytes`
			`.do_final:`
			`; reduce`
			`pslld xb, 2 ; b is scaled by 4`
			`movdqa xsa, xa ; scaled a`
			`pmulld xsa, [A_SCALE]`

			`phaddd xa, xa`
			`phaddd xb, xb`
			`phaddd xsa, xsa`
			`phaddd xa, xa`
			`phaddd xb, xb`
			`phaddd xsa, xsa`
			`psubd xb, xsa`

			`movd a_d, xa`
			`movd eax, xb`
			`add b_d, eax`

			`align 32`
			`.final_loop:`
			`movzx eax, byte[data]`
			`add a_d, eax`
			`inc data`
			`add b_d, a_d`
			`cmp data, end`
			`jb .final_loop`

			`.zero_size:`
			`mov eax, a_d`
			`xor edx, edx`
			`mov ecx, BASE`
			`div ecx ; divide edx:eax by ecx, quot->eax, rem->edx`
			`mov a_d, edx`

			`mov eax, b_d`
			`xor edx, edx`
			`mov ecx, BASE`
			`div ecx ; divide edx:eax by ecx, quot->eax, rem->edx`
			`shl edx, 16`
			`or edx, a_d`
			`mov eax, edx`

			`.end:`
			`FUNC_RESTORE`
			`ret`

			`endproc_frame`

			`section .data`
			`align 32`
			`A_SCALE:`
			`dq 0x0000000100000000, 0x0000000300000002`