isa-l/crc/aarch64/crc32_refl_common_pmull.h

########################################################################
# Copyright(c) 2019 Arm Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Arm Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
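// This header is a macro template. It is meant to be included by a
// polynomial-specific wrapper .S file that defines the symbols this
// macro references but does not define itself: the 16-bit constant
// pieces p4_{low,high}_b{0,1}, p1_{low,high}_b{0,1}, p0_low_b{0,1} and
// br_{low,high}_b{0,1,2}, plus a 256-entry uint32_t lookup table at
// .LANCHOR0. The wrapper then instantiates the function with
// "crc32_refl_func <symbol>".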
.macro crc32_refl_func name:req
.arch armv8-a+crc+crypto
.text
.align 3
.global \name
.type \name, %function
/* uint32_t crc32_refl_func(uint32_t seed, const uint8_t *buf, uint64_t len) */
// constant: buffers of at least FOLD_SIZE bytes take the PMULL path
.equ FOLD_SIZE, 1024
// parameters
w_seed .req w0
x_seed .req x0
x_buf .req x1
x_len .req x2
x_buf_tmp .req x0
// crc32 refl function entry
\name\():
mvn w_seed, w_seed // reflected CRC: complement the seed here and again on exit
mov x3, 0 // x3 = count of bytes already processed
cmp x_len, (FOLD_SIZE - 1)
bhi .crc32_clmul_pre
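// Table-driven byte-at-a-time path: handles short buffers and the tail
// left after PMULL folding, via the 256-entry table at .LANCHOR0.
// Each iteration is the classic reflected table step, in C terms:
//     crc = table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);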
.crc32_refl_tab_pre:
cmp x_len, x3
bls .done
mov x4, x3 // resume indexing at the already-processed count (64-bit clean)
adrp x5, .LANCHOR0
sub x_buf, x_buf, x4
add x5, x5, :lo12:.LANCHOR0
.align 3
.loop_crc32_refl_tab:
ldrb w3, [x_buf, x4]
add x4, x4, 1
cmp x_len, x4
eor x3, x_seed, x3
and x3, x3, 255
ldr w3, [x5, x3, lsl 2]
eor w_seed, w3, w_seed, lsr 8
bhi .loop_crc32_refl_tab
.done:
mvn w_seed, w_seed // undo the entry complement to produce the final CRC
ret
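// Carry-less-multiply (PMULL) folding path, taken when len >= FOLD_SIZE:
// four 128-bit accumulators fold 64 bytes per iteration, then the result
// is reduced to 32 bits with Barrett reduction.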
d_y0_tmp .req d0
v_y0_tmp .req v0
q_x0_tmp .req q3
v_x0_tmp .req v3
v_x0 .req v0
q_x1 .req q2
q_x2 .req q4
q_x3 .req q1
d_p4_low .req d17
d_p4_high .req d19
x_buf_end .req x3
.align 2
.crc32_clmul_pre:
and x4, x_len, -64 // bytes to fold, rounded down to a multiple of 64
uxtw x_seed, w_seed
cmp x4, 63
bls .clmul_end
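// XOR the complemented seed into the low 32 bits of the first 16-byte
// block, then load the initial 64 bytes into accumulators v_x0..v_x3.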
fmov d_y0_tmp, x_seed
ins v_y0_tmp.d[1], x3
ldr q_x0_tmp, [x_buf]
ldr q_x1, [x_buf, 16]
ldr q_x2, [x_buf, 32]
ldr q_x3, [x_buf, 48]
eor v_x0.16b, v_y0_tmp.16b, v_x0_tmp.16b
sub x5, x4, #64 // bytes left after the first 64-byte block
cmp x5, 63
add x_buf_tmp, x_buf, 64
bls .clmul_loop_end // only one block: skip straight to the final fold
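// Assemble the 64-byte fold constants p4_low/p4_high from the 16-bit
// pieces supplied by the including file (mov + movk).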
mov x4, p4_high_b0
movk x4, p4_high_b1, lsl 16
fmov d_p4_high, x4
mov x4, p4_low_b0
movk x4, p4_low_b1, lsl 16
fmov d_p4_low, x4
add x_buf_end, x_buf_tmp, x5
v_p4_low .req v17
v_p4_high .req v19
// v_x0 .req v0
v_x1 .req v2
v_x2 .req v4
v_x3 .req v1
q_y0 .req q7
q_y1 .req q5
q_y2 .req q3
q_y3 .req q21
v_y0 .req v7
v_y1 .req v5
v_y2 .req v3
v_y3 .req v21
d_x0_h .req d22
d_x1_h .req d20
d_x2_h .req d18
d_x3_h .req d6
v_x0_h .req v22
v_x1_h .req v20
v_x2_h .req v18
v_x3_h .req v6
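// Main fold loop. With PMULL as a 64x64 -> 128-bit carry-less multiply,
// each of the four accumulators advances across the next 64 bytes as
//     x_i = (x_i.lo * p4_low) ^ (x_i.hi * p4_high) ^ y_i
// where y_i is the freshly loaded 16-byte block.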
.align 3
.clmul_loop:
add x_buf_tmp, x_buf_tmp, 64
cmp x_buf_tmp, x_buf_end
dup d_x0_h, v_x0.d[1]
dup d_x1_h, v_x1.d[1]
dup d_x2_h, v_x2.d[1]
dup d_x3_h, v_x3.d[1]
ldr q_y0, [x_buf_tmp, -64]
ldr q_y1, [x_buf_tmp, -48]
ldr q_y2, [x_buf_tmp, -32]
ldr q_y3, [x_buf_tmp, -16]
pmull v_x0.1q, v_x0.1d, v_p4_low.1d
pmull v_x1.1q, v_x1.1d, v_p4_low.1d
pmull v_x2.1q, v_x2.1d, v_p4_low.1d
pmull v_x3.1q, v_x3.1d, v_p4_low.1d
pmull v_x0_h.1q, v_x0_h.1d, v_p4_high.1d
pmull v_x1_h.1q, v_x1_h.1d, v_p4_high.1d
pmull v_x2_h.1q, v_x2_h.1d, v_p4_high.1d
pmull v_x3_h.1q, v_x3_h.1d, v_p4_high.1d
eor v_y0.16b, v_y0.16b, v_x0_h.16b
eor v_y1.16b, v_y1.16b, v_x1_h.16b
eor v_y2.16b, v_y2.16b, v_x2_h.16b
eor v_y3.16b, v_y3.16b, v_x3_h.16b
eor v_x0.16b, v_y0.16b, v_x0.16b
eor v_x1.16b, v_y1.16b, v_x1.16b
eor v_x2.16b, v_y2.16b, v_x2.16b
eor v_x3.16b, v_y3.16b, v_x3.16b
bne .clmul_loop
// v_x0 .req v0
// v_x1 .req v2
// v_x2 .req v4
// v_x3 .req v1
d_x0 .req d0
d_p1_high .req d7
d_p1_low .req d17
v_p1_high .req v7
v_p1_low .req v17
.clmul_loop_end:
// fold the four 128-bit accumulators (512 bits) down to one 128-bit value
mov x0, p1_high_b0
movk x0, p1_high_b1, lsl 16
fmov d_p1_high, x0
mov x0, p1_low_b0
movk x0, p1_low_b1, lsl 16
fmov d_p1_low, x0
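// Chain the accumulators into one 128-bit remainder:
// x1 ^= fold(x0), x2 ^= fold(x1), x0 = fold(x2) ^ x3,
// where fold(v) = (v.lo * p1_low) ^ (v.hi * p1_high).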
dup d6, v_x0.d[1]
pmull v_x0.1q, v_x0.1d, v_p1_low.1d
pmull v6.1q, v6.1d, v_p1_high.1d
eor v6.16b, v6.16b, v_x0.16b
eor v_x1.16b, v6.16b, v_x1.16b
dup d6, v_x1.d[1]
pmull v_x1.1q, v_x1.1d, v_p1_low.1d
pmull v6.1q, v6.1d, v_p1_high.1d
eor v6.16b, v6.16b, v_x1.16b
eor v_x2.16b, v6.16b, v_x2.16b
dup d_x0, v_x2.d[1] // d_x0 temporarily holds the high half of v_x2
pmull v_x2.1q, v_x2.1d, v_p1_low.1d
pmull v_x0.1q, v_x0.1d, v_p1_high.1d
eor v_x0.16b, v_x0.16b, v_x2.16b
eor v_x0.16b, v_x0.16b, v_x3.16b
// final reduction constants
mov x0, 4294967295
fmov d3, x0 // v3 = 0xffffffff mask
movi v5.4s, 0 // v5 = zero, shift helper for ext
// fold 128b -> 64b
mov x4, p0_low_b0
movk x4, p0_low_b1, lsl 16
fmov d1, x4
dup d2, v0.d[0]
ext v0.16b, v0.16b, v5.16b, #8
pmull v2.1q, v2.1d, v7.1d
eor v0.16b, v0.16b, v2.16b
and v2.16b, v3.16b, v0.16b
ext v0.16b, v0.16b, v5.16b, #4
pmull v2.1q, v2.1d, v1.1d
// barrett reduction
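// Two more carry-less multiplies by the 48-bit constants br_high and
// br_low (assembled from three 16-bit pieces each) reduce the 64-bit
// remainder to the final 32-bit CRC; v3 masks intermediates to 32 bits.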
mov x3, br_high_b0
movk x3, br_high_b1, lsl 16
movk x3, br_high_b2, lsl 32
fmov d1, x3
eor v0.16b, v0.16b, v2.16b
and v2.16b, v0.16b, v3.16b
pmull v2.1q, v2.1d, v1.1d
mov x0, br_low_b0
movk x0, br_low_b1, lsl 16
movk x0, br_low_b2, lsl 32
fmov d1, x0
and v2.16b, v3.16b, v2.16b
pmull v2.1q, v2.1d, v1.1d
eor v0.16b, v0.16b, v2.16b
umov w_seed, v0.s[1] // the 32-bit result lands in element 1 of v0
uxtw x_seed, w_seed
.clmul_end:
// Skip past the folded prefix and let the table loop finish the tail.
// Use the full 64-bit length so lengths >= 4 GiB stay correct.
and x3, x_len, -64
add x_buf, x_buf, x3
b .crc32_refl_tab_pre
.size \name, .-\name
.endm
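// Illustrative instantiation from a wrapper .S file (the symbol name is
// an example; the wrapper must also define the constants and table
// described at the top of this file):
//      crc32_refl_func crc32_gzip_refl_pmull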