isa-l/crc/aarch64/crc32_norm_common_pmull.h

########################################################################
# Copyright(c) 2019 Arm Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Arm Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
.macro crc32_norm_func name:req
.arch armv8-a+crc+crypto
.text
.align 3
.global \name
.type \name, %function
/* crc32_norm_func(uint32_t seed, uint8_t * buf, uint64_t len) */
// constant
.equ FOLD_SIZE, 1024
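// buffers of FOLD_SIZE bytes or more take the pmull folding path;
// shorter buffers (and the tail left after folding) use the table loop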
// parameter
w_seed .req w0
x_seed .req x0
x_buf .req x1
x_len .req x2
x_buf_tmp .req x0
// crc32 normal function entry
\name\():
mvn w_seed, w_seed
mov x3, 0
mov w4, 0
cmp x_len, (FOLD_SIZE - 1)
uxtw x_seed, w_seed
bhi .crc32_clmul_pre
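// byte-wise table path: crc = table[((crc >> 24) ^ byte) & 0xff] ^ (crc << 8),
// using the 256-entry table at .LANCHOR0 (expected to be provided by the
// .S file that instantiates this macro)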
.crc32_norm_tab_pre:
cmp x_len, x3
bls .done
sxtw x4, w4
adrp x5, .LANCHOR0
sub x_buf, x_buf, x4
add x5, x5, :lo12:.LANCHOR0
.align 3
.loop_crc32_norm_tab:
ldrb w3, [x_buf, x4]
add x4, x4, 1
cmp x_len, x4
eor x3, x3, x0, lsr 24
and x3, x3, 255
ldr w3, [x5, x3, lsl 2]
eor x0, x3, x0, lsl 8
bhi .loop_crc32_norm_tab
.done:
mvn w_seed, w_seed
ret
// crc32 clmul prepare
x_buf_end .req x3
q_shuffle .req q3
v_shuffle .req v3
q_x0_tmp .req q5
q_x1 .req q6
q_x2 .req q4
q_x3 .req q1
v_x0_tmp .req v5
v_x0 .req v2
v_x1 .req v6
v_x2 .req v4
v_x3 .req v1
d_p4_high .req d7
d_p4_low .req d5
.align 2
.crc32_clmul_pre:
and x3, x_len, -64
cmp x3, 63
bls .clmul_end
lsl x_seed, x_seed, 32
movi v2.4s, 0
ins v2.d[1], x_seed
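// place the inverted seed in the top 32 bits of the first accumulator so
// that, after the byte reversal below, it is xor-ed over the first four
// message bytes (normal CRC folds the init value into the message head)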
adrp x4, .shuffle
ldr q_shuffle, [x4, #:lo12:.shuffle]
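// .shuffle is a tbl mask that byte-reverses each 16-byte lane: the normal
// (non-reflected) CRC processes the most significant byte first, while ldr
// loads little-endian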
sub x4, x3, #64
cmp x4, 63
ldr q_x0_tmp, [x_buf]
ldr q_x1, [x_buf, 16]
ldr q_x2, [x_buf, 32]
ldr q_x3, [x_buf, 48]
add x_buf_tmp, x_buf, 64
tbl v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
tbl v_x1.16b, {v_x1.16b}, v_shuffle.16b
tbl v_x2.16b, {v_x2.16b}, v_shuffle.16b
tbl v_x3.16b, {v_x3.16b}, v_shuffle.16b
eor v_x0.16b, v_x0.16b, v_x0_tmp.16b
bls .clmul_loop_end
add x_buf_end, x_buf_tmp, x4
mov x4, p4_high_b0
movk x4, p4_high_b1, lsl 16
fmov d_p4_high, x4
mov x4, p4_low_b0
movk x4, p4_low_b1, lsl 16
fmov d_p4_low, x4
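// p4_high/p4_low assemble the fold constants for a 512-bit (4 x 128-bit)
// shift from 16-bit pieces; they are expected to be .equ-defined by the
// including .S file (e.g. crc32_ieee_norm_pmull.S)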
// crc32 clmul loop
//v_x0 .req v2
//v_x1 .req v6
//v_x2 .req v4
//v_x3 .req v1
d_x0_high .req d22
d_x1_high .req d20
d_x2_high .req d18
d_x3_high .req d16
v_x0_high .req v22
v_x1_high .req v20
v_x2_high .req v18
v_x3_high .req v16
q_y0_high .req q23
q_y1_high .req q21
q_y2_high .req q19
q_y3_high .req q17
v_y0_high .req v23
v_y1_high .req v21
v_y2_high .req v19
v_y3_high .req v17
v_p4_high .req v7
v_p4_low .req v5
//v_shuffle .req v3
.align 3
.clmul_loop:
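// each iteration folds the four 128-bit accumulators forward by 512 bits:
// multiply the low/high 64-bit halves by p4_low/p4_high (carry-less),
// xor the two products, then xor in the next 16 byte-reversed input bytes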
dup d_x0_high, v_x0.d[1]
dup d_x1_high, v_x1.d[1]
dup d_x2_high, v_x2.d[1]
dup d_x3_high, v_x3.d[1]
add x_buf_tmp, x_buf_tmp, 64
ldr q_y0_high, [x_buf_tmp, -64]
ldr q_y1_high, [x_buf_tmp, -48]
ldr q_y2_high, [x_buf_tmp, -32]
ldr q_y3_high, [x_buf_tmp, -16]
cmp x_buf_tmp, x_buf_end
pmull v_x0.1q, v_x0.1d, v_p4_low.1d
pmull v_x1.1q, v_x1.1d, v_p4_low.1d
pmull v_x2.1q, v_x2.1d, v_p4_low.1d
pmull v_x3.1q, v_x3.1d, v_p4_low.1d
pmull v_x0_high.1q, v_x0_high.1d, v_p4_high.1d
pmull v_x1_high.1q, v_x1_high.1d, v_p4_high.1d
pmull v_x2_high.1q, v_x2_high.1d, v_p4_high.1d
pmull v_x3_high.1q, v_x3_high.1d, v_p4_high.1d
eor v_x0.16b, v_x0_high.16b, v_x0.16b
eor v_x1.16b, v_x1_high.16b, v_x1.16b
eor v_x2.16b, v_x2_high.16b, v_x2.16b
eor v_x3.16b, v_x3_high.16b, v_x3.16b
tbl v_y0_high.16b, {v_y0_high.16b}, v_shuffle.16b
tbl v_y1_high.16b, {v_y1_high.16b}, v_shuffle.16b
tbl v_y2_high.16b, {v_y2_high.16b}, v_shuffle.16b
tbl v_y3_high.16b, {v_y3_high.16b}, v_shuffle.16b
eor v_x0.16b, v_x0.16b, v_y0_high.16b
eor v_x1.16b, v_x1.16b, v_y1_high.16b
eor v_x2.16b, v_x2.16b, v_y2_high.16b
eor v_x3.16b, v_x3.16b, v_y3_high.16b
bne .clmul_loop
//v_x0 .req v2
//v_x1 .req v6
//v_x2 .req v4
//v_x3 .req v1
d_p1_high .req d7
d_p1_low .req d5
v_p1_high .req v7
v_p1_low .req v5
.clmul_loop_end:
// folding 512bit --> 128bit
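// x0 is folded forward 128 bits with p1_low/p1_high (constants again
// expected from the including .S file) and xor-ed into x1; the same step
// chains x1 into x2 and x2 into x3, leaving one 128-bit value in x3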
mov x0, p1_high_b0
movk x0, p1_high_b1, lsl 16
fmov d_p1_high, x0
mov x0, p1_low_b0
movk x0, p1_low_b1, lsl 16
fmov d_p1_low, x0
dup d16, v_x0.d[1]
pmull v_x0.1q, v_x0.1d, v_p1_low.1d
pmull v16.1q, v16.1d, v_p1_high.1d
eor v_x0.16b, v16.16b, v_x0.16b
eor v_x1.16b, v_x0.16b, v_x1.16b
dup d17, v_x1.d[1]
pmull v_x1.1q, v_x1.1d, v_p1_low.1d
pmull v17.1q, v17.1d, v_p1_high.1d
eor v_x1.16b, v17.16b, v_x1.16b
eor v_x2.16b, v_x1.16b, v_x2.16b
dup d0, v_x2.d[1]
pmull v_x2.1q, v_x2.1d, v_p1_low.1d
pmull v0.1q, v0.1d, v_p1_high.1d
eor v_x2.16b, v0.16b, v_x2.16b
eor v_x3.16b, v_x2.16b, v_x3.16b
//v_x0 .req v2
//v_x3 .req v1
d_x0 .req d2
v_zero .req v3
// fold 64b
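// reduce the 128-bit value in x3 towards 64 bits: multiply its high half
// by p0_high, combine with the low half shifted up 32 bits, then multiply
// the upper half of that result by p0_low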
movi v_zero.4s, 0
mov x5, p0_high_b0
movk x5, p0_high_b1, lsl 16
mov x0, p0_low_b0
movk x0, p0_low_b1, lsl 16
dup d_x0, v_x3.d[1]
ext v0.16b, v_zero.16b, v_x3.16b, #8
fmov d16, x5
pmull v_x0.1q, v_x0.1d, v16.1d
fmov d17, x0
ext v0.16b, v0.16b, v_zero.16b, #4
eor v0.16b, v0.16b, v_x0.16b
dup d_x0, v0.d[1]
pmull v_x0.1q, v_x0.1d, v17.1d
// barrett reduction
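// Barrett reduction: two carry-less multiplies against the br_low/br_high
// constants reduce the folded value to the final 32-bit remainder, which
// is returned in x_seed for the table loop to finish the tail bytes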
d_br_low .req d16
d_br_high .req d17
v_br_low .req v16
v_br_high .req v17
mov x4, br_low_b0
movk x4, br_low_b1, lsl 16
movk x4, br_low_b2, lsl 32
mov x3, br_high_b0
movk x3, br_high_b1, lsl 16
movk x3, br_high_b2, lsl 32
fmov d_br_low, x4
eor v0.16b, v0.16b, v2.16b
umov x0, v0.d[0]
fmov d2, x0
ext v2.16b, v2.16b, v3.16b, #4
pmull v2.1q, v2.1d, v_br_low.1d
fmov d_br_high, x3
ext v2.16b, v2.16b, v3.16b, #4
pmull v2.1q, v2.1d, v_br_high.1d
eor v0.16b, v0.16b, v2.16b
umov x_seed, v0.d[0]
.clmul_end:
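// advance the buffer past the len & ~63 bytes already folded and let the
// table loop process any remaining tail bytes (and apply the final mvn)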
and w4, w2, -64
sxtw x3, w4
add x1, x1, x3
b .crc32_norm_tab_pre
.size \name, .-\name
.section .rodata.cst16,"aM",@progbits,16
.align 4
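// tbl control vector that reverses the byte order of a 16-byte vector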
.shuffle:
.byte 15, 14, 13, 12, 11, 10, 9
.byte 8, 7, 6, 5, 4, 3, 2, 1, 0
.endm