;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; uint32_t adler32_avx2(uint32_t init, const unsigned char *buf, uint64_t len)

%define LIMIT 5552
%define BASE  0xFFF1 ; 65521

%include "reg_sizes.asm"

default rel
[bits 64]

; need to keep free: eax, ecx, edx

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx

 %define init_d edi
 %define data   r9
 %define size   r10
 %define s      r11
 %define a_d    r12d
 %define b_d    r8d
 %define end    r13

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
%macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif


%ifidn __OUTPUT_FORMAT__, win64
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8

 %define init_d r12d
 %define data   r9
 %define size	r10
 %define s	r11
 %define a_d	esi
 %define b_d	edi
 %define end	r13

 %define stack_size  5*8		; must be an odd multiple of 8
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_reg	rdi,  0*8
	save_reg	rsi,  1*8
	save_reg	r12,  2*8
	save_reg	r13,  3*8
	end_prolog
	mov	init_d, ecx	; initalize init_d from arg1 to keep ecx free
 %endmacro

 %macro FUNC_RESTORE 0
	mov	rdi,  [rsp + 0*8]
	mov	rsi,  [rsp + 1*8]
	mov	r12,  [rsp + 2*8]
	mov	r13,  [rsp + 3*8]
	add	rsp, stack_size
 %endmacro
%endif

%define xa	xmm0
%define xb	xmm1
%define xdata0	xmm2
%define xdata1	xmm3
%define xsa	xmm4

[bits 64]
default rel
section .text

mk_global adler32_sse, function
func(adler32_sse)
	FUNC_SAVE

	mov	data, arg2
	mov	size, arg3

	mov	b_d, init_d
	shr	b_d, 16
	and	init_d, 0xFFFF
	cmp	size, 32
	jb	.lt64
	movd	xa, init_d
	pxor	xb, xb
.sloop1:
	mov	s, LIMIT
	cmp	s, size
	cmova	s, size		; s = min(size, LIMIT)
	lea	end, [data + s - 7]
	cmp	data, end
	jae	.skip_loop_1a
align 32
.sloop1a:
	; do 8 adds
	pmovzxbd xdata0, [data]
	pmovzxbd xdata1, [data + 4]
	add	data, 8
	paddd	xa, xdata0
	paddd	xb, xa
	paddd	xa, xdata1
	paddd	xb, xa
	cmp	data, end
	jb	.sloop1a

.skip_loop_1a:
	add	end, 7

	test	s, 7
	jnz	.do_final

	; either we're done, or we just did LIMIT
	sub	size, s

	; reduce
	pslld	xb, 2   ; b is scaled by 4
	movdqa	xsa, xa ; scaled a
	pmulld	xsa, [A_SCALE]

	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa
	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa

	movd	eax, xa
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	psubd	xb, xsa
	movd	eax, xb
	add	eax, b_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	b_d, edx

	test	size, size
	jz	.finish

	; continue loop
	movd	xa, a_d
	pxor	xb, xb
	jmp	.sloop1

.finish:
	mov	eax, b_d
	shl	eax, 16
	or	eax, a_d
	jmp	.end

.lt64:
	mov	a_d, init_d
	lea	end, [data + size]
	test	size, size
	jnz	.final_loop
	jmp	.zero_size

	; handle remaining 1...15 bytes
.do_final:
	; reduce
	pslld	xb, 2   ; b is scaled by 4
	movdqa	xsa, xa ; scaled a
	pmulld	xsa, [A_SCALE]

	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa
	phaddd	xa, xa
	phaddd	xb, xb
	phaddd	xsa, xsa
	psubd	xb, xsa

	movd	a_d, xa
	movd	eax, xb
	add	b_d, eax

align 32
.final_loop:
	movzx	eax, byte[data]
	add	a_d, eax
	inc	data
	add	b_d, a_d
	cmp	data, end
	jb	.final_loop

.zero_size:
	mov	eax, a_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	mov	eax, b_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx		; divide edx:eax by ecx, quot->eax, rem->edx
	shl	edx, 16
	or	edx, a_d
	mov	eax, edx

.end:
	FUNC_RESTORE
	ret

endproc_frame

section .data
align 32
A_SCALE:
	dq	0x0000000100000000, 0x0000000300000002