########################################################################
#  Copyright (c) 2019 Microsoft Corporation.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Microsoft Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

#include "../include/aarch64_label.h"

// parameters
#define w_seed		w0
#define x_seed		x0
#define x_buf		x1
#define w_len		w2
#define x_len		x2

// return
#define w_crc_ret	w0
#define x_crc_ret	x0

// constant
#define FOLD_SIZE	64

// global variables
#define x_buf_end	x3
#define w_counter	w4
#define x_counter	x4
#define x_buf_iter	x5
#define x_crc_tab_addr	x6
#define x_tmp2		x6
#define w_tmp		w7
#define x_tmp		x7

#define v_x0		v0
#define d_x0		d0
#define s_x0		s0
#define q_x1		q1
#define v_x1		v1
#define q_x2		q2
#define v_x2		v2
#define q_x3		q3
#define v_x3		v3
#define d_x3		d3
#define s_x3		s3
#define q_y0		q4
#define v_y0		v4
#define v_tmp_high	v4
#define d_tmp_high	d4
#define q_y1		q5
#define v_y1		v5
#define v_tmp_low	v5
#define q_y2		q6
#define v_y2		v6
#define q_y3		q7
#define v_y3		v7
#define q_x0_tmp	q30
#define v_x0_tmp	v30
#define d_p4_high	v30.d[1]
#define d_p4_low	d30
#define v_p4		v30
#define d_p1_high	v30.d[1]
#define d_p1_low	d30
#define v_p1		v30
#define d_p0_high	v30.d[1]
#define d_p0_low	d30
#define v_p0		v30
#define d_br_low	d30
#define d_br_low2	v30.d[1]
#define v_br_low	v30
#define q_shuffle	q31
#define v_shuffle	v31
#define d_br_high	d31
#define d_br_high2	v31.d[1]
#define v_br_high	v31
#define d_p0_low2	d31
#define d_p0_high2	v31.d[1]
#define v_p02		v31
#define v_x0_high	v16
#define v_x1_high	v17
#define v_x2_high	v18
#define v_x3_high	v19

.macro crc_refl_load_first_block
	ldr q_x0_tmp, [x_buf]
	ldr q_x1, [x_buf, 16]
	ldr q_x2, [x_buf, 32]
	ldr q_x3, [x_buf, 48]

	and x_counter, x_len, -64
	sub x_tmp, x_counter, #64
	cmp x_tmp, 63

	add x_buf_iter, x_buf, 64

	eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm
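// Explanatory note (added): both "load first block" macros read the first
// 64 bytes of the buffer into v_x0_tmp/v_x1..v_x3, round x_len down to a
// multiple of FOLD_SIZE (and x_counter, x_len, -64), and fold the initial
// CRC value, which the caller is assumed to have placed in v_x0, into the
// first lane with the final eor. The reflected variant consumes bytes
// as-is; the normal variant (below) additionally byte-reverses each
// 16-byte lane through tbl/v_shuffle so that the carry-less multiplies
// that follow operate on the conventional, non-reflected bit order.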
.macro crc_norm_load_first_block
#ifndef __APPLE__
	adrp x_tmp, .shuffle_data
	ldr q_shuffle, [x_tmp, #:lo12:.shuffle_data]
#else
	adrp x_tmp, .shuffle_data@PAGE
	ldr q_shuffle, [x_tmp, #.shuffle_data@PAGEOFF]
#endif

	ldr q_x0_tmp, [x_buf]
	ldr q_x1, [x_buf, 16]
	ldr q_x2, [x_buf, 32]
	ldr q_x3, [x_buf, 48]

	and x_counter, x_len, -64
	sub x_tmp, x_counter, #64
	cmp x_tmp, 63

	add x_buf_iter, x_buf, 64

	tbl v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
	tbl v_x1.16b, {v_x1.16b}, v_shuffle.16b
	tbl v_x2.16b, {v_x2.16b}, v_shuffle.16b
	tbl v_x3.16b, {v_x3.16b}, v_shuffle.16b

	eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm

.macro crc32_load_p4
	add x_buf_end, x_buf_iter, x_tmp

	mov x_tmp, p4_low_b0
	movk x_tmp, p4_low_b1, lsl 16
	fmov d_p4_low, x_tmp

	mov x_tmp2, p4_high_b0
	movk x_tmp2, p4_high_b1, lsl 16
	fmov d_p4_high, x_tmp2
.endm

.macro crc64_load_p4
	add x_buf_end, x_buf_iter, x_tmp

	mov x_tmp, p4_low_b0
	movk x_tmp, p4_low_b1, lsl 16
	movk x_tmp, p4_low_b2, lsl 32
	movk x_tmp, p4_low_b3, lsl 48
	fmov d_p4_low, x_tmp

	mov x_tmp2, p4_high_b0
	movk x_tmp2, p4_high_b1, lsl 16
	movk x_tmp2, p4_high_b2, lsl 32
	movk x_tmp2, p4_high_b3, lsl 48
	fmov d_p4_high, x_tmp2
.endm

.macro crc_refl_loop
	.align 3
.clmul_loop:
	// Interleave ldr and pmull(2) for microarchitectures that can issue
	// a quadword load only every other cycle (e.g. Cortex-A55)
	ldr q_y0, [x_buf_iter]
	pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
	ldr q_y1, [x_buf_iter, 16]
	pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
	ldr q_y2, [x_buf_iter, 32]
	pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
	ldr q_y3, [x_buf_iter, 48]
	pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d

	pmull v_x0.1q, v_x0.1d, v_p4.1d
	add x_buf_iter, x_buf_iter, 64
	pmull v_x1.1q, v_x1.1d, v_p4.1d
	cmp x_buf_iter, x_buf_end
	pmull v_x2.1q, v_x2.1d, v_p4.1d
	pmull v_x3.1q, v_x3.1d, v_p4.1d

	eor v_x0.16b, v_x0_high.16b, v_x0.16b
	eor v_x0.16b, v_x0.16b, v_y0.16b
	eor v_x1.16b, v_x1_high.16b, v_x1.16b
	eor v_x1.16b, v_x1.16b, v_y1.16b
	eor v_x2.16b, v_x2_high.16b, v_x2.16b
	eor v_x2.16b, v_x2.16b, v_y2.16b
	eor v_x3.16b, v_x3_high.16b, v_x3.16b
	eor v_x3.16b, v_x3.16b, v_y3.16b
	bne .clmul_loop
.endm
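// Explanatory note (added): each .clmul_loop iteration folds the 512-bit
// running state forward across the next 64 bytes of input. Per 128-bit
// lane x_i, with the fold constants loaded by crc32/crc64_load_p4, it
// computes roughly
//
//     x_i' = (lo64(x_i) clmul p4_low) ^ (hi64(x_i) clmul p4_high) ^ y_i
//
// where y_i is the newly loaded 16-byte block; the exact exponents of x
// mod P(x) encoded in p4_low/p4_high depend on the CRC polynomial and bit
// order defined by the including file.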
.macro crc_norm_loop
	.align 3
.clmul_loop:
	// Interleave ldr and pmull(2) for microarchitectures that can issue
	// a quadword load only every other cycle (e.g. Cortex-A55)
	ldr q_y0, [x_buf_iter]
	pmull2 v_x0_high.1q, v_x0.2d, v_p4.2d
	ldr q_y1, [x_buf_iter, 16]
	pmull2 v_x1_high.1q, v_x1.2d, v_p4.2d
	ldr q_y2, [x_buf_iter, 32]
	pmull2 v_x2_high.1q, v_x2.2d, v_p4.2d
	ldr q_y3, [x_buf_iter, 48]
	pmull2 v_x3_high.1q, v_x3.2d, v_p4.2d

	pmull v_x0.1q, v_x0.1d, v_p4.1d
	add x_buf_iter, x_buf_iter, 64
	pmull v_x1.1q, v_x1.1d, v_p4.1d
	cmp x_buf_iter, x_buf_end
	pmull v_x2.1q, v_x2.1d, v_p4.1d
	pmull v_x3.1q, v_x3.1d, v_p4.1d

	tbl v_y0.16b, {v_y0.16b}, v_shuffle.16b
	tbl v_y1.16b, {v_y1.16b}, v_shuffle.16b
	tbl v_y2.16b, {v_y2.16b}, v_shuffle.16b
	tbl v_y3.16b, {v_y3.16b}, v_shuffle.16b

	eor v_x0.16b, v_x0.16b, v_x0_high.16b
	eor v_x1.16b, v_x1.16b, v_x1_high.16b
	eor v_x2.16b, v_x2.16b, v_x2_high.16b
	eor v_x3.16b, v_x3.16b, v_x3_high.16b

	eor v_x0.16b, v_x0.16b, v_y0.16b
	eor v_x1.16b, v_x1.16b, v_y1.16b
	eor v_x2.16b, v_x2.16b, v_y2.16b
	eor v_x3.16b, v_x3.16b, v_y3.16b
	bne .clmul_loop
.endm

.macro crc32_fold_512b_to_128b
	mov x_tmp, p1_low_b0
	movk x_tmp, p1_low_b1, lsl 16
	fmov d_p1_low, x_tmp

	mov x_tmp2, p1_high_b0
	movk x_tmp2, p1_high_b1, lsl 16
	fmov d_p1_high, x_tmp2

	pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d
	pmull v_tmp_low.1q, v_x0.1d, v_p1.1d
	eor v_x1.16b, v_x1.16b, v_tmp_high.16b
	eor v_x1.16b, v_x1.16b, v_tmp_low.16b

	pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d
	pmull v_tmp_low.1q, v_x1.1d, v_p1.1d
	eor v_x2.16b, v_x2.16b, v_tmp_high.16b
	eor v_x2.16b, v_x2.16b, v_tmp_low.16b

	pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d
	pmull v_tmp_low.1q, v_x2.1d, v_p1.1d
	eor v_x3.16b, v_x3.16b, v_tmp_high.16b
	eor v_x3.16b, v_x3.16b, v_tmp_low.16b
.endm

.macro crc64_fold_512b_to_128b
	mov x_tmp, p1_low_b0
	movk x_tmp, p1_low_b1, lsl 16
	movk x_tmp, p1_low_b2, lsl 32
	movk x_tmp, p1_low_b3, lsl 48
	fmov d_p1_low, x_tmp

	mov x_tmp2, p1_high_b0
	movk x_tmp2, p1_high_b1, lsl 16
	movk x_tmp2, p1_high_b2, lsl 32
	movk x_tmp2, p1_high_b3, lsl 48
	fmov d_p1_high, x_tmp2

	pmull2 v_tmp_high.1q, v_x0.2d, v_p1.2d
	pmull v_tmp_low.1q, v_x0.1d, v_p1.1d
	eor v_x1.16b, v_x1.16b, v_tmp_high.16b
	eor v_x1.16b, v_x1.16b, v_tmp_low.16b

	pmull2 v_tmp_high.1q, v_x1.2d, v_p1.2d
	pmull v_tmp_low.1q, v_x1.1d, v_p1.1d
	eor v_x2.16b, v_x2.16b, v_tmp_high.16b
	eor v_x2.16b, v_x2.16b, v_tmp_low.16b

	pmull2 v_tmp_high.1q, v_x2.2d, v_p1.2d
	pmull v_tmp_low.1q, v_x2.1d, v_p1.1d
	eor v_x3.16b, v_x3.16b, v_tmp_high.16b
	eor v_x3.16b, v_x3.16b, v_tmp_low.16b
.endm
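// Explanatory note (added): the fold_512b_to_128b macros reduce the four
// lanes to one by chaining folds with the 128-bit-distance constants
// p1_low/p1_high, each step computing roughly
//
//     x_{i+1} ^= (lo64(x_i) clmul p1_low) ^ (hi64(x_i) clmul p1_high)
//
// and leaving the combined value in v_x3. The remaining 128-bit -> 32/64-bit
// reduction, for which the p0 and Barrett-reduction register aliases
// (d_p0_*, d_br_*) are reserved above, is presumably performed by the file
// that includes these macros.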