########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

.macro crc64_refl_func name:req
        .arch armv8-a+crc+crypto
        .text
        .align 3
        .global \name
        .type \name, %function

        // parameter
        x_seed .req x0
        x_buf .req x1
        x_len .req x2

        // return
        x_crc_ret .req x0

        // constant
        .equ FOLD_SIZE, 1024

        // global variable
        x_buf_end .req x3
        x_counter .req x4
        x_buf_iter .req x5
        x_crc64_tab_addr .req x6
        w_tmp .req w7
        x_tmp .req x7

// crc64 refl function entry
\name\():
        // crc64 for table
        mvn x_seed, x_seed
        mov x_counter, 0
        cmp x_len, (FOLD_SIZE-1)
        bhi .crc64_clmul_pre

.crc64_tab_pre:
        cmp x_len, x_counter
        bls .done

        adrp x_tmp, .lanchor_crc64_tab
        add x_buf_iter, x_buf, x_counter
        add x_buf, x_buf, x_len
        add x_crc64_tab_addr, x_tmp, :lo12:.lanchor_crc64_tab

        .align 3
.loop_crc64_tab:
        ldrb w_tmp, [x_buf_iter], 1
        eor w_tmp, w_tmp, w0
        cmp x_buf, x_buf_iter
        and x_tmp, x_tmp, 255
        ldr x_tmp, [x_crc64_tab_addr, x_tmp, lsl 3]
        eor x_seed, x_tmp, x_seed, lsr 8
        bne .loop_crc64_tab

.done:
        mvn x_crc_ret, x_crc_ret
        ret

        // clmul prepare
        q_x0 .req q0
        q_x1 .req q4
        q_x2 .req q6
        q_x3 .req q1

        v_x0 .req v0
        v_x1 .req v4
        v_x2 .req v6
        v_x3 .req v1

        d_p4_high .req d17
        d_p4_low .req d7
        v_p4_high .req v17
        v_p4_low .req v7

        d_y0_tmp .req d0
        v_y0_tmp .req v0

        q_tmp .req q2
        v_tmp .req v2

        .align 2
.crc64_clmul_pre:
        ldr q_tmp, [x_buf]
        ldr q_x1, [x_buf, 16]
        ldr q_x2, [x_buf, 32]
        ldr q_x3, [x_buf, 48]

        and x_counter, x_len, -64
        sub x_tmp, x_counter, #64
        cmp x_tmp, 63

        fmov d_y0_tmp, x_seed // save crc to d0
        eor v_x0.16b, v_y0_tmp.16b, v_tmp.16b

        add x_buf_iter, x_buf, 64
        bls .clmul_loop_end

        add x_buf_end, x_buf_iter, x_tmp

        mov x_tmp, p4_high_b0
        movk x_tmp, p4_high_b1, lsl 16
        movk x_tmp, p4_high_b2, lsl 32
        movk x_tmp, p4_high_b3, lsl 48
        fmov d_p4_high, x_tmp

        mov x_tmp, p4_low_b0
        movk x_tmp, p4_low_b1, lsl 16
        movk x_tmp, p4_low_b2, lsl 32
        movk x_tmp, p4_low_b3, lsl 48
        fmov d_p4_low, x_tmp

// 1024bit --> 512bit loop
// merge x0, x1, x2, x3, y0, y1, y2, y3 => x0, x1, x2, x3 (uint64x2_t)
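//
// Explanatory note: each pass of .clmul_loop advances x_buf_iter by 64
// bytes and folds the four 128-bit accumulators (x0..x3) forward over the
// next 64 bytes of input (y0..y3). For every accumulator, the high and low
// 64-bit halves are carry-less multiplied (pmull) against the fold
// constants p4_high/p4_low, and both 128-bit products are XORed into the
// newly loaded data. The constants are per-polynomial values of the form
// x^k mod P; they are not defined here and are expected to be supplied as
// p4_high_b0..b3 / p4_low_b0..b3 by the file that expands this macro.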
        d_x0_high .req d24
        d_x1_high .req d22
        d_x2_high .req d20
        d_x3_high .req d16
        v_x0_high .req v24
        v_x1_high .req v22
        v_x2_high .req v20
        v_x3_high .req v16

        q_x0_tmp .req q2
        q_x1_tmp .req q5
        q_x2_tmp .req q3
        q_x3_tmp .req q18
        v_x0_tmp .req v2
        v_x1_tmp .req v5
        v_x2_tmp .req v3
        v_x3_tmp .req v18

        .align 3
.clmul_loop:
        add x_buf_iter, x_buf_iter, 64
        cmp x_buf_iter, x_buf_end

        dup d_x0_high, v_x0.d[1]
        dup d_x1_high, v_x1.d[1]
        dup d_x2_high, v_x2.d[1]
        dup d_x3_high, v_x3.d[1]

        pmull v_x0_high.1q, v_x0_high.1d, v_p4_high.1d
        pmull v_x1_high.1q, v_x1_high.1d, v_p4_high.1d
        pmull v_x2_high.1q, v_x2_high.1d, v_p4_high.1d
        pmull v_x3_high.1q, v_x3_high.1d, v_p4_high.1d

        pmull v_x0.1q, v_x0.1d, v_p4_low.1d
        pmull v_x1.1q, v_x1.1d, v_p4_low.1d
        pmull v_x2.1q, v_x2.1d, v_p4_low.1d
        pmull v_x3.1q, v_x3.1d, v_p4_low.1d

        ldr q_x0_tmp, [x_buf_iter, -64]
        ldr q_x1_tmp, [x_buf_iter, -48]
        ldr q_x2_tmp, [x_buf_iter, -32]
        ldr q_x3_tmp, [x_buf_iter, -16]

        eor v_x0_tmp.16b, v_x0_tmp.16b, v_x0_high.16b
        eor v_x1_tmp.16b, v_x1_tmp.16b, v_x1_high.16b
        eor v_x2_tmp.16b, v_x2_tmp.16b, v_x2_high.16b
        eor v_x3_tmp.16b, v_x3_tmp.16b, v_x3_high.16b

        eor v_x0.16b, v_x0_tmp.16b, v_x0.16b
        eor v_x1.16b, v_x1_tmp.16b, v_x1.16b
        eor v_x2.16b, v_x2_tmp.16b, v_x2.16b
        eor v_x3.16b, v_x3_tmp.16b, v_x3.16b

        bne .clmul_loop

// folding 512bit --> 128bit
// merge x0, x1, x2, x3 => x3 (uint64x2_t)
// input: x0 -> v_x0, x1 -> v_x1, x2 -> v_x2, x3 -> v_x3
// output: v_x3
        d_p1_high .req d5
        d_p1_low .req d3
        v_p1_high .req v5
        v_p1_low .req v3

        d_tmp_high .req d16
        d_tmp_low .req d2
        v_tmp_high .req v16
        v_tmp_low .req v2

.clmul_loop_end:
        mov x_tmp, p1_high_b0
        movk x_tmp, p1_high_b1, lsl 16
        movk x_tmp, p1_high_b2, lsl 32
        movk x_tmp, p1_high_b3, lsl 48
        fmov d_p1_high, x_tmp

        mov x_tmp, p1_low_b0
        movk x_tmp, p1_low_b1, lsl 16
        movk x_tmp, p1_low_b2, lsl 32
        movk x_tmp, p1_low_b3, lsl 48
        fmov d_p1_low, x_tmp

        dup d_tmp_high, v_x0.d[1]
        dup d_tmp_low, v_x0.d[0]
        pmull v_tmp_high.1q, v_tmp_high.1d, v_p1_high.1d
        pmull v_tmp_low.1q, v_tmp_low.1d, v_p1_low.1d
        eor v_tmp_high.16b, v_tmp_high.16b, v_tmp_low.16b
        eor v_x1.16b, v_tmp_high.16b, v_x1.16b

        dup d_tmp_high, v_x1.d[1]
        pmull v_x1.1q, v_x1.1d, v_p1_low.1d
        pmull v_tmp_high.1q, v_tmp_high.1d, v_p1_high.1d
        eor v_tmp_high.16b, v_tmp_high.16b, v_x1.16b
        eor v_x2.16b, v_tmp_high.16b, v_x2.16b

        dup d_tmp_high, v_x2.d[1]
        pmull v_x2.1q, v_x2.1d, v_p1_low.1d
        pmull v_tmp_high.1q, v_tmp_high.1d, v_p1_high.1d
        eor v_tmp_high.16b, v_tmp_high.16b, v_x2.16b
        eor v_x3.16b, v_tmp_high.16b, v_x3.16b

// fold 64b
// input: v_x3
// output: v_x3
        d_p0_low .req d3
        v_p0_low .req v3

        d_x3_low_fold_64b .req d2
        v_x3_low_fold_64b .req v2
        v_zero_fold_64b .req v0

        mov x_tmp, p0_low_b0
        movk x_tmp, p0_low_b1, lsl 16
        movk x_tmp, p0_low_b2, lsl 32
        movk x_tmp, p0_low_b3, lsl 48
        fmov d_p0_low, x_tmp

        dup d_x3_low_fold_64b, v_x3.d[0]
        movi v_zero_fold_64b.4s, 0
        ext v_x3.16b, v_x3.16b, v0.16b, #8

        pmull v_x3_low_fold_64b.1q, v_x3_low_fold_64b.1d, v_p0_low.1d
        eor v_x3.16b, v_x3.16b, v_x3_low_fold_64b.16b

// barrett reduction
// input: v_x3
// output: x0
        d_br_low .req d3
        d_br_high .req d5
        v_br_low .req v3
        v_br_high .req v5

        mov x0, br_low_b0
        movk x0, br_low_b1, lsl 16
        movk x0, br_low_b2, lsl 32
        movk x0, br_low_b3, lsl 48
        fmov d_br_low, x0

        mov x0, br_high_b0
        movk x0, br_high_b1, lsl 16
        movk x0, br_high_b2, lsl 32
        movk x0, br_high_b3, lsl 48
        fmov d_br_high, x0

        dup d2, v_x3.d[0]
        pmull v2.1q, v2.1d, v_br_low.1d
        pmull v4.1q, v2.1d, v_br_high.1d
        ext v0.16b, v0.16b, v2.16b, #8
        eor v0.16b, v0.16b, v4.16b
        eor v0.16b, v0.16b, v_x3.16b
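        // At this point the Barrett-reduced 64-bit remainder sits in the
        // upper half of v0. Move it back into x0 (x_seed/x_crc_ret) and
        // branch to the table-driven path, which consumes the remaining
        // tail bytes (len % 64, if any) and applies the final inversion
        // in .done.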
        umov x0, v0.d[1]

        b .crc64_tab_pre

        .size \name, .-\name
.endm
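
// Usage sketch (illustrative; the function name below is hypothetical):
// a per-polynomial source file is expected to define the fold/reduction
// constants referenced above (p4_high_b0..b3, p4_low_b0..b3,
// p1_high_b0..b3, p1_low_b0..b3, p0_low_b0..b3, br_high_b0..b3,
// br_low_b0..b3), provide the 256-entry, 8-byte-per-entry lookup table at
// .lanchor_crc64_tab, and then expand the macro with the desired symbol
// name, e.g.:
//
//      crc64_refl_func crc64_example_refl_pmull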