mirror of https://github.com/intel/isa-l.git
synced 2024-12-13 09:52:56 +01:00
commit a46da529d9
Change-Id: I49166ee06b3ad24babb90aeb0b834d8aacfc2d03
Signed-off-by: Zhiyuan Zhu <zhiyuan.zhu@arm.com>
303 lines
7.3 KiB
AArch64 assembly (GNU as)
########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
.macro crc64_refl_func name:req
|
|
.arch armv8-a+crc+crypto
|
|
.text
|
|
.align 3
|
|
.global \name
|
|
.type \name, %function
|
|
|
|
// parameter
|
|
x_seed .req x0
|
|
x_buf .req x1
|
|
x_len .req x2
|
|
|
|
// return
|
|
x_crc_ret .req x0
|
|
|
|
// constant
|
|
.equ FOLD_SIZE, 1024
|
|
|
|
// global variable
|
|
x_buf_end .req x3
|
|
x_counter .req x4
|
|
x_buf_iter .req x5
|
|
x_crc64_tab_addr .req x6
|
|
w_tmp .req w7
|
|
x_tmp .req x7
|
|
|
|
// crc64 refl function entry
|
|
\name\():
|
|
// crc64 for table
|
|
mvn x_seed, x_seed
|
|
mov x_counter, 0
|
|
cmp x_len, (FOLD_SIZE-1)
|
|
bhi .crc64_clmul_pre
|
|
.crc64_tab_pre:
|
|
cmp x_len, x_counter
|
|
bls .done
|
|
|
|
adrp x_tmp, .lanchor_crc64_tab
|
|
add x_buf_iter, x_buf, x_counter
|
|
add x_buf, x_buf, x_len
|
|
add x_crc64_tab_addr, x_tmp, :lo12:.lanchor_crc64_tab
|
|
|
|
.align 3
|
|
.loop_crc64_tab:
|
|
ldrb w_tmp, [x_buf_iter], 1
|
|
eor w_tmp, w_tmp, w0
|
|
cmp x_buf, x_buf_iter
|
|
and x_tmp, x_tmp, 255
|
|
ldr x_tmp, [x_crc64_tab_addr, x_tmp, lsl 3]
|
|
eor x_seed, x_tmp, x_seed, lsr 8
|
|
bne .loop_crc64_tab
|
|
.done:
|
|
mvn x_crc_ret, x_crc_ret
|
|
ret
|
|
|
|
// clmul prepare
|
|
q_x0 .req q0
|
|
q_x1 .req q4
|
|
q_x2 .req q6
|
|
q_x3 .req q1
|
|
|
|
v_x0 .req v0
|
|
v_x1 .req v4
|
|
v_x2 .req v6
|
|
v_x3 .req v1
|
|
|
|
d_p4_high .req d17
|
|
d_p4_low .req d7
|
|
v_p4_high .req v17
|
|
v_p4_low .req v7
|
|
|
|
d_y0_tmp .req d0
|
|
v_y0_tmp .req v0
|
|
|
|
q_tmp .req q2
|
|
v_tmp .req v2
|
|
|
|
.align 2
|
|
.crc64_clmul_pre:
|
|
ldr q_tmp, [x_buf]
|
|
ldr q_x1, [x_buf, 16]
|
|
ldr q_x2, [x_buf, 32]
|
|
ldr q_x3, [x_buf, 48]
|
|
|
|
and x_counter, x_len, -64
|
|
sub x_tmp, x_counter, #64
|
|
cmp x_tmp, 63
|
|
|
|
fmov d_y0_tmp, x_seed // save crc to d0
|
|
eor v_x0.16b, v_y0_tmp.16b, v_tmp.16b
|
|
|
|
add x_buf_iter, x_buf, 64
|
|
bls .clmul_loop_end
|
|
|
|
add x_buf_end, x_buf_iter, x_tmp
|
|
|
|
mov x_tmp, p4_high_b0
|
|
movk x_tmp, p4_high_b1, lsl 16
|
|
movk x_tmp, p4_high_b2, lsl 32
|
|
movk x_tmp, p4_high_b3, lsl 48
|
|
fmov d_p4_high, x_tmp
|
|
|
|
mov x_tmp, p4_low_b0
|
|
movk x_tmp, p4_low_b1, lsl 16
|
|
movk x_tmp, p4_low_b2, lsl 32
|
|
movk x_tmp, p4_low_b3, lsl 48
|
|
fmov d_p4_low, x_tmp
|
|
|
|
// 1024bit --> 512bit loop
|
|
// merge x0, x1, x2, x3, y0, y1, y2, y3 => x0, x1, x2, x3 (uint64x2_t)
|
|
d_x0_high .req d24
|
|
d_x1_high .req d22
|
|
d_x2_high .req d20
|
|
d_x3_high .req d16
|
|
|
|
v_x0_high .req v24
|
|
v_x1_high .req v22
|
|
v_x2_high .req v20
|
|
v_x3_high .req v16
|
|
|
|
q_x0_tmp .req q2
|
|
q_x1_tmp .req q5
|
|
q_x2_tmp .req q3
|
|
q_x3_tmp .req q18
|
|
|
|
v_x0_tmp .req v2
|
|
v_x1_tmp .req v5
|
|
v_x2_tmp .req v3
|
|
v_x3_tmp .req v18
|
|
|
|
q_x0_tmp .req q2
|
|
q_x1_tmp .req q5
|
|
q_x2_tmp .req q3
|
|
q_x3_tmp .req q18
|
|
|
|
.align 3
|
|
.clmul_loop:
|
|
add x_buf_iter, x_buf_iter, 64
|
|
cmp x_buf_iter, x_buf_end
|
|
|
|
dup d_x0_high, v_x0.d[1]
|
|
dup d_x1_high, v_x1.d[1]
|
|
dup d_x2_high, v_x2.d[1]
|
|
dup d_x3_high, v_x3.d[1]
|
|
|
|
pmull v_x0_high.1q, v_x0_high.1d, v_p4_high.1d
|
|
pmull v_x1_high.1q, v_x1_high.1d, v_p4_high.1d
|
|
pmull v_x2_high.1q, v_x2_high.1d, v_p4_high.1d
|
|
pmull v_x3_high.1q, v_x3_high.1d, v_p4_high.1d
|
|
|
|
pmull v_x0.1q, v_x0.1d, v_p4_low.1d
|
|
pmull v_x1.1q, v_x1.1d, v_p4_low.1d
|
|
pmull v_x2.1q, v_x2.1d, v_p4_low.1d
|
|
pmull v_x3.1q, v_x3.1d, v_p4_low.1d
|
|
|
|
ldr q_x0_tmp, [x_buf_iter, -64]
|
|
ldr q_x1_tmp, [x_buf_iter, -48]
|
|
ldr q_x2_tmp, [x_buf_iter, -32]
|
|
ldr q_x3_tmp, [x_buf_iter, -16]
|
|
|
|
eor v_x0_tmp.16b, v_x0_tmp.16b, v_x0_high.16b
|
|
eor v_x1_tmp.16b, v_x1_tmp.16b, v_x1_high.16b
|
|
eor v_x2_tmp.16b, v_x2_tmp.16b, v_x2_high.16b
|
|
eor v_x3_tmp.16b, v_x3_tmp.16b, v_x3_high.16b
|
|
|
|
eor v_x0.16b, v_x0_tmp.16b, v_x0.16b
|
|
eor v_x1.16b, v_x1_tmp.16b, v_x1.16b
|
|
eor v_x2.16b, v_x2_tmp.16b, v_x2.16b
|
|
eor v_x3.16b, v_x3_tmp.16b, v_x3.16b
|
|
bne .clmul_loop
|
|
|
|
// folding 512bit --> 128bit
|
|
// merge x0, x1, x2, x3 => x3 (uint64x2_t)
|
|
// input: x0 -> v_x0, x1 -> v_x1, x2 -> v_x2, x3 -> v_x3
|
|
// output: v_x3
|
|
d_p1_high .req d5
|
|
d_p1_low .req d3
|
|
v_p1_high .req v5
|
|
v_p1_low .req v3
|
|
|
|
d_tmp_high .req d16
|
|
d_tmp_low .req d2
|
|
v_tmp_high .req v16
|
|
v_tmp_low .req v2
|
|
|
|
.clmul_loop_end:
|
|
mov x_tmp, p1_high_b0
|
|
movk x_tmp, p1_high_b1, lsl 16
|
|
movk x_tmp, p1_high_b2, lsl 32
|
|
movk x_tmp, p1_high_b3, lsl 48
|
|
fmov d_p1_high, x_tmp
|
|
|
|
mov x_tmp, p1_low_b0
|
|
movk x_tmp, p1_low_b1, lsl 16
|
|
movk x_tmp, p1_low_b2, lsl 32
|
|
movk x_tmp, p1_low_b3, lsl 48
|
|
fmov d_p1_low, x_tmp
|
|
|
|
dup d_tmp_high, v_x0.d[1]
|
|
dup d_tmp_low, v_x0.d[0]
|
|
|
|
pmull v_tmp_high.1q, v_tmp_high.1d, v_p1_high.1d
|
|
pmull v_tmp_low.1q, v_tmp_low.1d, v_p1_low.1d
|
|
eor v_tmp_high.16b, v_tmp_high.16b, v_tmp_low.16b
|
|
eor v_x1.16b, v_tmp_high.16b, v_x1.16b
|
|
|
|
dup d_tmp_high, v_x1.d[1]
|
|
pmull v_x1.1q, v_x1.1d, v_p1_low.1d
|
|
pmull v_tmp_high.1q, v_tmp_high.1d, v_p1_high.1d
|
|
eor v_tmp_high.16b, v_tmp_high.16b, v_x1.16b
|
|
eor v_x2.16b, v_tmp_high.16b, v_x2.16b
|
|
|
|
dup d_tmp_high, v_x2.d[1]
|
|
pmull v_x2.1q, v_x2.1d, v_p1_low.1d
|
|
pmull v_tmp_high.1q, v_tmp_high.1d, v_p1_high.1d
|
|
eor v_tmp_high.16b, v_tmp_high.16b, v_x2.16b
|
|
eor v_x3.16b, v_tmp_high.16b, v_x3.16b
|
|
|
|
// fold 64b
|
|
// input: v_x3
|
|
// output: v_x3
|
|
d_p0_low .req d3
|
|
v_p0_low .req v3
|
|
d_x3_low_fold_64b .req d2
|
|
v_x3_low_fold_64b .req v2
|
|
v_zero_fold_64b .req v0
|
|
mov x_tmp, p0_low_b0
|
|
movk x_tmp, p0_low_b1, lsl 16
|
|
movk x_tmp, p0_low_b2, lsl 32
|
|
movk x_tmp, p0_low_b3, lsl 48
|
|
fmov d_p0_low, x_tmp
|
|
|
|
dup d_x3_low_fold_64b, v_x3.d[0]
|
|
movi v_zero_fold_64b.4s, 0
|
|
ext v_x3.16b, v_x3.16b, v0.16b, #8
|
|
|
|
pmull v_x3_low_fold_64b.1q, v_x3_low_fold_64b.1d, v_p0_low.1d
|
|
eor v_x3.16b, v_x3.16b, v_x3_low_fold_64b.16b
|
|
|
|
// barrett reduction
|
|
// input: v_x3
|
|
// output: x0
|
|
d_br_low .req d3
|
|
d_br_high .req d5
|
|
v_br_low .req v3
|
|
v_br_high .req v5
|
|
mov x0, br_low_b0
|
|
movk x0, br_low_b1, lsl 16
|
|
movk x0, br_low_b2, lsl 32
|
|
movk x0, br_low_b3, lsl 48
|
|
fmov d_br_low, x0
|
|
|
|
mov x0, br_high_b0
|
|
movk x0, br_high_b1, lsl 16
|
|
movk x0, br_high_b2, lsl 32
|
|
movk x0, br_high_b3, lsl 48
|
|
fmov d_br_high, x0
|
|
|
|
dup d2, v_x3.d[0]
|
|
|
|
pmull v2.1q, v2.1d, v_br_low.1d
|
|
pmull v4.1q, v2.1d, v_br_high.1d
|
|
|
|
ext v0.16b, v0.16b, v2.16b, #8
|
|
|
|
eor v0.16b, v0.16b, v4.16b
|
|
eor v0.16b, v0.16b, v_x3.16b
|
|
umov x0, v0.d[1]
|
|
|
|
b .crc64_tab_pre
|
|
|
|
.size \name, .-\name
|
|
.endm
|