isa-l/crc/aarch64/crc32_norm_common_pmull.h

########################################################################
# Copyright(c) 2019 Arm Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Arm Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
.macro crc32_norm_func name:req
.arch armv8-a+crc+crypto
.text
.align 3
.global \name
.type \name, %function
/* crc32_norm_func(uint32_t seed, uint8_t * buf, uint64_t len) */
// constant
.equ FOLD_SIZE, 1024
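// buffers of FOLD_SIZE bytes or more take the pmull folding path;
// shorter buffers (and the tail left after folding) use the table loop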
// parameter
w_seed .req w0
x_seed .req x0
x_buf .req x1
x_len .req x2
x_buf_tmp .req x0
// crc32 normal function entry
\name\():
mvn w_seed, w_seed
mov x3, 0
mov w4, 0
cmp x_len, (FOLD_SIZE - 1)
uxtw x_seed, w_seed
bhi .crc32_clmul_pre
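// byte-wise table path: crc = table[((crc >> 24) ^ byte) & 0xff] ^ (crc << 8),
// using the 256-entry table at .LANCHOR0 (expected to be provided by the
// .S file that instantiates this macro)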
.crc32_norm_tab_pre:
cmp x_len, x3
bls .done
sxtw x4, w4
adrp x5, .LANCHOR0
sub x_buf, x_buf, x4
add x5, x5, :lo12:.LANCHOR0
.align 3
.loop_crc32_norm_tab:
ldrb w3, [x_buf, x4]
add x4, x4, 1
cmp x_len, x4
eor x3, x3, x0, lsr 24
and x3, x3, 255
ldr w3, [x5, x3, lsl 2]
eor x0, x3, x0, lsl 8
bhi .loop_crc32_norm_tab
.done:
mvn w_seed, w_seed
ret
// crc32 clmul prepare
x_buf_end .req x3
q_shuffle .req q3
v_shuffle .req v3
q_x0_tmp .req q5
q_x1 .req q6
q_x2 .req q4
q_x3 .req q1
v_x0_tmp .req v5
v_x0 .req v2
v_x1 .req v6
v_x2 .req v4
v_x3 .req v1
d_p4_high .req d7
d_p4_low .req d5
.align 2
.crc32_clmul_pre:
and x3, x_len, -64
cmp x3, 63
bls .clmul_end
lsl x_seed, x_seed, 32
movi v2.4s, 0
ins v2.d[1], x_seed
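// place the inverted seed in the top 32 bits of the first accumulator so
// that, after the byte reversal below, it is xor-ed over the first four
// message bytes (normal CRC folds the init value into the message head)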
adrp x4, .shuffle
ldr q_shuffle, [x4, #:lo12:.shuffle]
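// .shuffle is a tbl mask that byte-reverses each 16-byte lane: the normal
// (non-reflected) CRC processes the most significant byte first, while ldr
// loads little-endian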
sub x4, x3, #64
cmp x4, 63
ldr q_x0_tmp, [x_buf]
ldr q_x1, [x_buf, 16]
ldr q_x2, [x_buf, 32]
ldr q_x3, [x_buf, 48]
add x_buf_tmp, x_buf, 64
tbl v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
tbl v_x1.16b, {v_x1.16b}, v_shuffle.16b
tbl v_x2.16b, {v_x2.16b}, v_shuffle.16b
tbl v_x3.16b, {v_x3.16b}, v_shuffle.16b
eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
bls .clmul_loop_end
add x_buf_end, x_buf_tmp, x4
mov x4, p4_high_b0
movk x4, p4_high_b1, lsl 16
fmov d_p4_high, x4
mov x4, p4_low_b0
movk x4, p4_low_b1, lsl 16
fmov d_p4_low, x4
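// p4_high/p4_low assemble the fold constants for a 512-bit (4 x 128-bit)
// shift from 16-bit pieces; they are expected to be .equ-defined by the
// including .S file (e.g. crc32_ieee_norm_pmull.S)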
// crc32 clmul loop
//v_x0 .req v2
//v_x1 .req v6
//v_x2 .req v4
//v_x3 .req v1
d_x0_high .req d22
d_x1_high .req d20
d_x2_high .req d18
d_x3_high .req d16
v_x0_high .req v22
v_x1_high .req v20
v_x2_high .req v18
v_x3_high .req v16
q_y0_high .req q23
q_y1_high .req q21
q_y2_high .req q19
q_y3_high .req q17
v_y0_high .req v23
v_y1_high .req v21
v_y2_high .req v19
v_y3_high .req v17
v_p4_high .req v7
v_p4_low .req v5
//v_shuffle .req v3
.align 3
.clmul_loop:
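// each iteration folds the four 128-bit accumulators forward by 512 bits:
// multiply the low/high 64-bit halves by p4_low/p4_high (carry-less),
// xor the two products, then xor in the next 16 byte-reversed input bytes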
dup d_x0_high, v_x0.d[1]
dup d_x1_high, v_x1.d[1]
dup d_x2_high, v_x2.d[1]
dup d_x3_high, v_x3.d[1]
add x_buf_tmp, x_buf_tmp, 64
ldr q_y0_high, [x_buf_tmp, -64]
ldr q_y1_high, [x_buf_tmp, -48]
ldr q_y2_high, [x_buf_tmp, -32]
ldr q_y3_high, [x_buf_tmp, -16]
cmp x_buf_tmp, x_buf_end
pmull v_x0.1q, v_x0.1d, v_p4_low.1d
pmull v_x1.1q, v_x1.1d, v_p4_low.1d
pmull v_x2.1q, v_x2.1d, v_p4_low.1d
pmull v_x3.1q, v_x3.1d, v_p4_low.1d
pmull v_x0_high.1q, v_x0_high.1d, v_p4_high.1d
pmull v_x1_high.1q, v_x1_high.1d, v_p4_high.1d
pmull v_x2_high.1q, v_x2_high.1d, v_p4_high.1d
pmull v_x3_high.1q, v_x3_high.1d, v_p4_high.1d
eor v_x0.16b, v_x0_high.16b, v_x0.16b
eor v_x1.16b, v_x1_high.16b, v_x1.16b
eor v_x2.16b, v_x2_high.16b, v_x2.16b
eor v_x3.16b, v_x3_high.16b, v_x3.16b
tbl v_y0_high.16b, {v_y0_high.16b}, v_shuffle.16b
tbl v_y1_high.16b, {v_y1_high.16b}, v_shuffle.16b
tbl v_y2_high.16b, {v_y2_high.16b}, v_shuffle.16b
tbl v_y3_high.16b, {v_y3_high.16b}, v_shuffle.16b
eor v_x0.16b, v_x0.16b, v_y0_high.16b
eor v_x1.16b, v_x1.16b, v_y1_high.16b
eor v_x2.16b, v_x2.16b, v_y2_high.16b
eor v_x3.16b, v_x3.16b, v_y3_high.16b
bne .clmul_loop
//v_x0 .req v2
//v_x1 .req v6
//v_x2 .req v4
//v_x3 .req v1
d_p1_high .req d7
d_p1_low .req d5
v_p1_high .req v7
v_p1_low .req v5
.clmul_loop_end:
// folding 512bit --> 128bit
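// x0 is folded forward 128 bits with p1_low/p1_high (constants again
// expected from the including .S file) and xor-ed into x1; the same step
// chains x1 into x2 and x2 into x3, leaving one 128-bit value in x3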
mov x0, p1_high_b0
movk x0, p1_high_b1, lsl 16
fmov d_p1_high, x0
mov x0, p1_low_b0
movk x0, p1_low_b1, lsl 16
fmov d_p1_low, x0
dup d16, v_x0.d[1]
pmull v_x0.1q, v_x0.1d, v_p1_low.1d
pmull v16.1q, v16.1d, v_p1_high.1d
eor v_x0.16b, v16.16b, v_x0.16b
eor v_x1.16b, v_x0.16b, v_x1.16b
dup d17, v_x1.d[1]
pmull v_x1.1q, v_x1.1d, v_p1_low.1d
pmull v17.1q, v17.1d, v_p1_high.1d
eor v_x1.16b, v17.16b, v_x1.16b
eor v_x2.16b, v_x1.16b, v_x2.16b
dup d0, v_x2.d[1]
pmull v_x2.1q, v_x2.1d, v_p1_low.1d
pmull v0.1q, v0.1d, v_p1_high.1d
eor v_x2.16b, v0.16b, v_x2.16b
eor v_x3.16b, v_x2.16b, v_x3.16b
//v_x0 .req v2
//v_x3 .req v1
d_x0 .req d2
v_zero .req v3
// fold 64b
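// reduce the 128-bit value in x3 towards 64 bits: multiply its high half
// by p0_high, combine with the low half shifted up 32 bits, then multiply
// the upper half of that result by p0_low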
movi v_zero.4s, 0
mov x5, p0_high_b0
movk x5, p0_high_b1, lsl 16
mov x0, p0_low_b0
movk x0, p0_low_b1, lsl 16
dup d_x0, v_x3.d[1]
ext v0.16b, v_zero.16b, v_x3.16b, #8
fmov d16, x5
pmull v_x0.1q, v_x0.1d, v16.1d
fmov d17, x0
ext v0.16b, v0.16b, v_zero.16b, #4
eor v0.16b, v0.16b, v_x0.16b
dup d_x0, v0.d[1]
pmull v_x0.1q, v_x0.1d, v17.1d
// barrett reduction
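// Barrett reduction: two carry-less multiplies against the br_low/br_high
// constants reduce the folded value to the final 32-bit remainder, which
// is returned in x_seed for the table loop to finish the tail bytes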
d_br_low .req d16
d_br_high .req d17
v_br_low .req v16
v_br_high .req v17
mov x4, br_low_b0
movk x4, br_low_b1, lsl 16
movk x4, br_low_b2, lsl 32
mov x3, br_high_b0
movk x3, br_high_b1, lsl 16
movk x3, br_high_b2, lsl 32
fmov d_br_low, x4
eor v0.16b, v0.16b, v2.16b
umov x0, v0.d[0]
fmov d2, x0
ext v2.16b, v2.16b, v3.16b, #4
pmull v2.1q, v2.1d, v_br_low.1d
fmov d_br_high, x3
ext v2.16b, v2.16b, v3.16b, #4
pmull v2.1q, v2.1d, v_br_high.1d
eor v0.16b, v0.16b, v2.16b
umov x_seed, v0.d[0]
.clmul_end:
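// advance the buffer past the len & ~63 bytes already folded and let the
// table loop process any remaining tail bytes (and apply the final mvn)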
and w4, w2, -64
sxtw x3, w4
add x1, x1, x3
b .crc32_norm_tab_pre
.size \name, .-\name
.section .rodata.cst16,"aM",@progbits,16
.align 4
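// tbl control vector that reverses the byte order of a 16-byte vector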
.shuffle:
.byte 15, 14, 13, 12, 11, 10, 9
.byte 8, 7, 6, 5, 4, 3, 2, 1, 0
.endm