isa-l/crc/aarch64/crc32_gzip_refl_hw_fold.S
Zhiyuan Zhu a46da529d9 crc: optimize crc with arm64 assembly
Change-Id: I49166ee06b3ad24babb90aeb0b834d8aacfc2d03
Signed-off-by: Zhiyuan Zhu <zhiyuan.zhu@arm.com>
2019-06-21 17:02:16 +08:00


########################################################################
# Copyright(c) 2019 Arm Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Arm Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
.arch armv8-a+crc+crypto
.text
.align 3
.global crc32_gzip_refl_hw_fold
.type crc32_gzip_refl_hw_fold, %function
/* uint32_t crc32_gzip_refl_hw_fold(uint32_t seed, const unsigned char *buf, uint64_t len) */
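/*
 * Strategy: the input is consumed in 1 KiB blocks. Each block is split into
 * three 336-byte streams that are CRCed in parallel with the hardware crc32x
 * instruction; the three partial CRCs are then merged with two PMULL
 * multiplies by precomputed fold constants, and the final 16 bytes of the
 * block are CRCed directly. Any remainder (< 1 KiB) is handled by the plain
 * 8/4/2/1-byte loops at the end.
 */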
w_seed .req w0
w_crc .req w0
x_buf .req x1
x_len .req x2
x_buf_loop_end .req x10
x_buf_iter .req x10
x_tmp .req x15
w_tmp .req w15
d_c0 .req d3
d_c1 .req d1
v_c0 .req v3
v_c1 .req v1
crc32_gzip_refl_hw_fold:
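	/* gzip-reflected CRC32 complements the CRC on entry and again on exit */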
mvn w_seed, w_seed
cmp x_len, 1023
mov x_buf_iter, x_buf
bls .loop_fold_end
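	/* x_buf_loop_end = buf + (len rounded down to a multiple of 1024),
	 * i.e. the first byte past the last full 1 KiB block */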
sub x_buf_loop_end, x_len, #1024
and x_buf_loop_end, x_buf_loop_end, -1024
add x_buf_loop_end, x_buf_loop_end, 1024
add x_buf_loop_end, x_buf, x_buf_loop_end
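	/* Load the two fold constants into d_c0/d_c1. These appear to be
	 * precomputed (x^k mod P) values for the reflected CRC-32 polynomial,
	 * used to shift a stream's CRC forward past the following streams;
	 * the exact exponents are not documented here. */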
mov x_tmp, 0x819b
movk x_tmp, 0xb486, lsl 16
fmov d_c0, x_tmp
mov x_tmp, 0x8617
movk x_tmp, 0x7627, lsl 16
fmov d_c1, x_tmp
x_in64 .req x3
w_crc0 .req w0
w_crc1 .req w4
w_crc2 .req w5
d_crc0 .req d4
d_crc1 .req d5
v_crc0 .req v4
v_crc1 .req v5
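	/* One iteration folds a 1 KiB block: bytes [0, 336) feed w_crc0,
	 * [336, 672) feed w_crc1, and [672, 1008) feed w_crc2, 8 bytes per
	 * stream per pass; x9 marks the end of the first stream. */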
.align 3
.loop_fold:
add x9, x_buf, 336
mov x_in64, x_buf
mov w_crc1, 0
mov w_crc2, 0
.align 3
.loop_for:
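	/* one 8-byte load per stream, all three CRCed in parallel */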
ldr x8, [x_in64]
ldr x7, [x_in64, 336]
ldr x6, [x_in64, 672]
add x_in64, x_in64, 8
cmp x_in64, x9
crc32x w_crc0, w_crc0, x8
crc32x w_crc1, w_crc1, x7
crc32x w_crc2, w_crc2, x6
bne .loop_for
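	/* Merge the three stream CRCs: multiply w_crc0 and w_crc1 by their
	 * fold constants with pmull, reduce each 64-bit product back to a
	 * 32-bit CRC via crc32x against a zero accumulator, XOR everything
	 * into w_crc0, then CRC the last 16 bytes of the block directly. */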
uxtw x_tmp, w_crc0
fmov d_crc0, x_tmp
pmull v_crc0.1q, v_crc0.1d, v_c0.1d
uxtw x_tmp, w_crc1
fmov d_crc1, x_tmp
pmull v_crc1.1q, v_crc1.1d, v_c1.1d
ldr x_tmp, [x_buf, 1008]
crc32x w_crc2, w_crc2, x_tmp
fmov x_tmp, d_crc0
crc32x w_crc0, wzr, x_tmp
fmov x_tmp, d_crc1
crc32x w_crc1, wzr, x_tmp
eor w_crc0, w_crc0, w_crc1
eor w_crc0, w_crc0, w_crc2
ldr x_tmp, [x_buf, 1016]
crc32x w_crc0, w_crc0, x_tmp
add x_buf, x_buf, 1024
cmp x_buf_loop_end, x_buf
bne .loop_fold
and x_len, x_len, 1023
x_buf_loop_size8_end .req x3
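/* tail: fewer than 1024 bytes remain (or the whole buffer was short) */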
.loop_fold_end:
cmp x_len, 7
bls .size_4
sub x_buf_loop_size8_end, x_len, #8
and x_buf_loop_size8_end, x_buf_loop_size8_end, -8
add x_buf_loop_size8_end, x_buf_loop_size8_end, 8
add x_buf_loop_size8_end, x_buf_iter, x_buf_loop_size8_end
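	/* x_buf_loop_size8_end = x_buf_iter + (len rounded down to a multiple
	 * of 8); consume the remainder 8 bytes at a time up to that point */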
.align 3
.loop_size_8:
ldr x_tmp, [x_buf_iter], 8
crc32x w_crc, w_crc, x_tmp
cmp x_buf_iter, x_buf_loop_size8_end
bne .loop_size_8
and x_len, x_len, 7
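	/* fewer than 8 bytes left: fold in one 4-, 2- and 1-byte step as needed */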
.size_4:
cmp x_len, 3
bls .size_2
ldr w_tmp, [x_buf_iter], 4
crc32w w_crc, w_crc, w_tmp
sub x_len, x_len, #4
.size_2:
cmp x_len, 1
bls .size_1
ldrh w_tmp, [x_buf_iter], 2
crc32h w_crc, w_crc, w_tmp
sub x_len, x_len, #2
.size_1:
cbz x_len, .done
ldrb w_tmp, [x_buf_iter]
crc32b w_crc, w_crc, w_tmp
.done:
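	/* undo the initial bit inversion to produce the final gzip CRC */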
mvn w_crc, w_crc
ret
.size crc32_gzip_refl_hw_fold, .-crc32_gzip_refl_hw_fold