isa-l/crc/aarch64/crc64_refl_common_pmull.h
Last commit: 4815174a68 (liuqinfei, 2023-08-18) crc: optimize by supporting arm xor fusion feature

Arrange the two xor instructions in the pattern required by the Arm
xor-fusion feature so that they can be fused at execution time, saving
one issue slot and one cycle of execution latency.

Change-Id: Ic64bcfe569b2468e4dc9c13d073d367cc81fd937
Signed-off-by: liuqinfei <lucas.liuqinfei@huawei.com>
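
The fusion-friendly pair referred to above is the two back-to-back
dependent eor instructions at the end of the Barrett reduction in this
file (the eor v_tmp_low.16b writes just before the result is extracted
with umov); keeping them adjacent lets cores that implement xor fusion
execute them as a fused pair, which is where the saved issue slot and
cycle of latency come from.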


########################################################################
# Copyright(c) 2019 Arm Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Arm Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
#include "crc_common_pmull.h"
.macro crc64_refl_func name:req
.arch armv8-a+crypto
.text
.align 3
.global cdecl(\name)
#ifndef __APPLE__
.type \name, %function
#endif
/* uint64_t crc64_refl_func(uint64_t seed, const uint8_t * buf, uint64_t len) */
cdecl(\name\()):
mvn x_seed, x_seed
mov x_counter, 0
cmp x_len, (FOLD_SIZE-1)
bhi .crc_clmul_pre
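// Lengths of FOLD_SIZE bytes or more take the PMULL folding path at
// .crc_clmul_pre; shorter buffers, and the tail left over after the
// folding path, are handled by the byte-wise table loop below.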
.crc_tab_pre:
cmp x_len, x_counter
bls .done
#ifndef __APPLE__
adrp x_tmp, .lanchor_crc_tab
add x_buf_iter, x_buf, x_counter
add x_buf, x_buf, x_len
add x_crc_tab_addr, x_tmp, :lo12:.lanchor_crc_tab
#else
adrp x_tmp, .lanchor_crc_tab@PAGE
add x_buf_iter, x_buf, x_counter
add x_buf, x_buf, x_len
add x_crc_tab_addr, x_tmp, .lanchor_crc_tab@PAGEOFF
#endif
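// Byte-at-a-time table loop: xor the next input byte into the low
// byte of the running CRC, use the result to index the 256-entry,
// 8-bytes-per-entry table at .lanchor_crc_tab, and xor that entry
// into the CRC shifted right by 8.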
.align 3
.loop_crc_tab:
ldrb w_tmp, [x_buf_iter], 1
eor w_tmp, w_tmp, w0
cmp x_buf, x_buf_iter
and x_tmp, x_tmp, 255
ldr x_tmp, [x_crc_tab_addr, x_tmp, lsl 3]
eor x_seed, x_tmp, x_seed, lsr 8
bne .loop_crc_tab
.done:
mvn x_crc_ret, x_seed
ret
.align 2
.crc_clmul_pre:
fmov d_x0, x_seed // save crc to d_x0
crc_refl_load_first_block
bls .clmul_loop_end
crc64_load_p4
// 1024bit --> 512bit loop
// merge x0, x1, x2, x3, y0, y1, y2, y3 => x0, x1, x2, x3 (uint64x2_t)
crc_refl_loop
.clmul_loop_end:
// folding 512bit --> 128bit
crc64_fold_512b_to_128b
// folding 128bit --> 64bit
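// p0_low_b0..p0_low_b3 are the four 16-bit halves of the 64-bit fold
// constant for the target polynomial, defined by the file that
// instantiates this macro; mov/movk assemble them into x_tmp.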
mov x_tmp, p0_low_b0
movk x_tmp, p0_low_b1, lsl 16
movk x_tmp, p0_low_b2, lsl 32
movk x_tmp, p0_low_b3, lsl 48
fmov d_p0_low, x_tmp
pmull v_tmp_low.1q, v_x3.1d, v_p0.1d
mov d_tmp_high, v_x3.d[1]
eor v_x3.16b, v_tmp_high.16b, v_tmp_low.16b
// barrett reduction
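// The two 64-bit Barrett constants br_low and br_high are likewise
// assembled 16 bits at a time; the two pmull multiplies and the xors
// that follow reduce the 128-bit value in v_x3 to the final 64-bit CRC.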
mov x_tmp, br_low_b0
movk x_tmp, br_low_b1, lsl 16
movk x_tmp, br_low_b2, lsl 32
movk x_tmp, br_low_b3, lsl 48
fmov d_br_low, x_tmp
mov x_tmp2, br_high_b0
movk x_tmp2, br_high_b1, lsl 16
movk x_tmp2, br_high_b2, lsl 32
movk x_tmp2, br_high_b3, lsl 48
fmov d_br_high, x_tmp2
pmull v_tmp_low.1q, v_x3.1d, v_br_low.1d
pmull v_tmp_high.1q, v_tmp_low.1d, v_br_high.1d
ext v_tmp_low.16b, v_br_low.16b, v_tmp_low.16b, #8
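// Keep the two dependent eor instructions below back to back so that
// cores implementing Arm xor fusion can execute them as a fused pair
// (see the commit message above).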
eor v_tmp_low.16b, v_tmp_high.16b, v_tmp_low.16b
eor v_tmp_low.16b, v_x3.16b, v_tmp_low.16b
umov x_crc_ret, v_tmp_low.d[1]
b .crc_tab_pre
#ifndef __APPLE__
.size \name, .-\name
#endif
.endm
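
This macro is instantiated once per reflected CRC-64 flavour by a small
.S file that defines the polynomial constants and the function name
(for example crc64_ecma_refl_pmull), and the resulting routine is
reached through isa-l's public crc64.h interface. A minimal usage
sketch in C, assuming an installed isa-l that exposes crc64_ecma_refl
through crc64.h (link with -lisal); the message and seed value are
illustrative:

#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include "crc64.h"	/* isa-l public CRC-64 API */

int main(void)
{
	const unsigned char msg[] = "123456789";

	/* Pass the previous CRC value (0 for a fresh computation); the
	 * routine applies the initial and final bit inversions itself
	 * (the mvn instructions at the entry and exit above). */
	uint64_t crc = crc64_ecma_refl(0, msg, (uint64_t)(sizeof(msg) - 1));

	printf("crc64 (ECMA, reflected) = 0x%016" PRIx64 "\n", crc);
	return 0;
}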