mirror of
https://github.com/intel/isa-l.git
synced 2025-01-19 04:26:08 +01:00
1187583a97
- It should be fine to enable pmull always on Apple Silicon - macOS 12+ is required for PMULL instruction. - Changed the conditional macro to __APPLE__ - Rewritten dispatcher using sysctlbyname - Use __USER_LABEL_PREFIX__ - Use __TEXT,__const as readonly section - use ASM_DEF_RODATA macro - fix func decl Change-Id: I800593f21085d8187b480c8bb3ab2bd70c4a6974 Signed-off-by: Taiju Yamada <tyamada@bi.a.u-tokyo.ac.jp>
587 lines
14 KiB
ArmAsm
587 lines
14 KiB
ArmAsm
/**********************************************************************
|
|
Copyright(c) 2020 Arm Corporation All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions
|
|
are met:
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in
|
|
the documentation and/or other materials provided with the
|
|
distribution.
|
|
* Neither the name of Arm Corporation nor the names of its
|
|
contributors may be used to endorse or promote products derived
|
|
from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
**********************************************************************/
|
|
|
|
#include "../include/aarch64_label.h"
|
|
|
|
// declare_generic_reg name, reg, default
//
// Defines three .req aliases for general-purpose register number "reg":
//   x_NAME  -> the 64-bit view  (x<reg>)
//   w_NAME  -> the 32-bit view  (w<reg>)
//   NAME    -> the view selected by "default", which is the width
//              prefix letter (w or x) pasted in front of the number
.macro	declare_generic_reg name:req, reg:req, default:req
	x_\name	.req	x\reg
	w_\name	.req	w\reg
	\name	.req	\default\reg
.endm
|
|
|
|
// declare_neon_reg name, reg, default
//
// Defines .req aliases for SIMD/FP register number "reg":
//   v_NAME / q_NAME / d_NAME / s_NAME -> the v, q, d and s views
//   NAME                              -> the view selected by "default",
//                                        a prefix letter pasted in front
//                                        of the register number
.macro	declare_neon_reg name:req, reg:req, default:req
	v_\name	.req	v\reg
	q_\name	.req	q\reg
	d_\name	.req	d\reg
	s_\name	.req	s\reg
	\name	.req	\default\reg
.endm
|
|
|
|
/**********************************************************************
	general-purpose register aliases

	Register numbers 9-13 are intentionally named twice: once for the
	2048-byte kernel (crc2, pconst, data_crc0-2) and once for the
	scalar tail of crc32_mix_main_default (size, crc_tmp, size_tmp,
	data_tmp1-3).  size_tmp and data_tmp1 are the same register (x11).
**********************************************************************/
// values read on entry by crc32_mix_main_default
declare_generic_reg	crc,		0,w
declare_generic_reg	buf,		1,x
declare_generic_reg	len,		2,x

// bookkeeping for the 2048-byte block loop
declare_generic_reg	buf_saved,	3,x
declare_generic_reg	buf_iter,	4,x
declare_generic_reg	len_saved,	5,x
declare_generic_reg	buf_tmp,	6,x

// crc32_mix2048 / pmull_fold: three scalar CRC accumulators, the
// constant-table pointer and one data register per scalar stream
declare_generic_reg	crc0,		7,x
declare_generic_reg	crc1,		8,x
declare_generic_reg	crc2,		9,x
declare_generic_reg	pconst,		10,x
declare_generic_reg	data_crc0,	11,x
declare_generic_reg	data_crc1,	12,x
declare_generic_reg	data_crc2,	13,x

// scalar tail (re-uses register numbers 9-13, see banner above)
declare_generic_reg	size,		9,x
declare_generic_reg	crc_tmp,	10,w
declare_generic_reg	size_tmp,	11,x
declare_generic_reg	data_tmp1,	11,x
declare_generic_reg	data_tmp2,	12,x
declare_generic_reg	data_tmp3,	13,x

// scratch
declare_generic_reg	tmp,		14,x
declare_generic_reg	tmp1,		15,x

// return value (same register as the incoming crc)
declare_generic_reg	ret_crc,	0,w
|
|
|
|
/**********************************************************************
	SIMD register aliases

	Only v0-v7 and v16-v31 receive names here; no alias maps onto
	v8-v15.  neon_zero and neon_tmp are two names for v24.
**********************************************************************/
// a0 holds the current folding constant pair, a1-a4 the four 128-bit
// folding accumulators (see pmull_fold)
declare_neon_reg	a0,		0,v
declare_neon_reg	a1,		1,v
declare_neon_reg	a2,		2,v
declare_neon_reg	a3,		3,v
declare_neon_reg	a4,		4,v

// low-half products produced inside pmull_fold
declare_neon_reg	a5,		16,v
declare_neon_reg	a6,		17,v
declare_neon_reg	a7,		18,v
declare_neon_reg	a8,		19,v

// freshly loaded 64 bytes of input inside pmull_fold
declare_neon_reg	y5,		20,v
declare_neon_reg	y6,		21,v
declare_neon_reg	y7,		22,v
declare_neon_reg	y8,		23,v

// shared register: zero vector / seed staging
declare_neon_reg	neon_zero,	24,v
declare_neon_reg	neon_tmp,	24,v

// reduction constants and temporaries
declare_neon_reg	k5k0,		25,v
declare_neon_reg	neon_tmp1,	26,v
declare_neon_reg	neon_tmp2,	27,v
declare_neon_reg	neon_tmp3,	28,v

// operands of the final merge of the partial CRCs
declare_neon_reg	crc_pmull,	29,v
declare_neon_reg	neon_crc0,	30,v
declare_neon_reg	neon_crc1,	31,v

// merge constants loaded from offset_crc32_const
declare_neon_reg	neon_const0,	5,v
declare_neon_reg	neon_const1,	6,v
declare_neon_reg	neon_const2,	7,v
|
|
|
|
// Byte offsets into the constant table at lanchor_crc32.  Every entry
// is read as one 16-byte q register; the entry at offset 0 is loaded
// directly through the symbol (k1k2), so it has no offset name.  Three
// consecutive entries are read starting at offset_crc32_const.
.equ	offset_k3k4,		16
.equ	offset_k5k0,		offset_k3k4 + 16
.equ	offset_poly,		offset_k5k0 + 16
.equ	offset_crc32_const,	offset_poly + 16
|
|
|
|
// pmull_fold: one 64-byte step of the combined PMULL / CRC-instruction
// pipeline used by crc32_mix2048.
//
// Vector side:
//   - both 64-bit halves of each accumulator v_a1..v_a4 are
//     carry-less multiplied by the matching half of v_a0
//     (pmull -> low halves into v_a5..v_a8, pmull2 -> high halves
//     back into v_a1..v_a4)
//   - the 64 bytes at [x_buf_tmp] are loaded into v_y5..v_y8 and XORed
//     with both products to form the new v_a1..v_a4
// Scalar side:
//   - eight doublewords per stream are fed through crc32_u64 into
//     w_crc0, w_crc1 and w_crc2, read from x_buf_tmp + 464..520,
//     + 976..1032 and + 1488..1544 respectively
//
// The caller advances x_buf_tmp by 64 between invocations.
// Writes: x_data_crc0-2, w_crc0-2, v_a1-v_a8, v_y5-v_y8.
// NOTE(review): the loads, multiplies and CRC steps are interleaved,
// presumably to overlap their latencies - keep the order when editing.
.macro	pmull_fold
	// doubleword 0 of each scalar stream, low product of a1
	ldr		x_data_crc0, [x_buf_tmp, 464]
	ldr		x_data_crc1, [x_buf_tmp, 976]
	ldr		x_data_crc2, [x_buf_tmp, 1488]

	pmull		v_a5.1q, v_a1.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// doubleword 1, low product of a2
	ldr		x_data_crc0, [x_buf_tmp, 472]
	ldr		x_data_crc1, [x_buf_tmp, 984]
	ldr		x_data_crc2, [x_buf_tmp, 1496]

	pmull		v_a6.1q, v_a2.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// doubleword 2, low product of a3
	ldr		x_data_crc0, [x_buf_tmp, 480]
	ldr		x_data_crc1, [x_buf_tmp, 992]
	ldr		x_data_crc2, [x_buf_tmp, 1504]

	pmull		v_a7.1q, v_a3.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// doubleword 3, low product of a4
	ldr		x_data_crc0, [x_buf_tmp, 488]
	ldr		x_data_crc1, [x_buf_tmp, 1000]
	ldr		x_data_crc2, [x_buf_tmp, 1512]

	pmull		v_a8.1q, v_a4.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// doubleword 4, high product of a1 (overwrites a1)
	ldr		x_data_crc0, [x_buf_tmp, 496]
	ldr		x_data_crc1, [x_buf_tmp, 1008]
	ldr		x_data_crc2, [x_buf_tmp, 1520]

	pmull2		v_a1.1q, v_a1.2d, v_a0.2d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// next 64 bytes of the vector stream
	ld1		{v_y5.4s, v_y6.4s, v_y7.4s, v_y8.4s}, [x_buf_tmp]

	// doubleword 5, high product of a2 (overwrites a2)
	ldr		x_data_crc0, [x_buf_tmp, 504]
	ldr		x_data_crc1, [x_buf_tmp, 1016]
	ldr		x_data_crc2, [x_buf_tmp, 1528]

	pmull2		v_a2.1q, v_a2.2d, v_a0.2d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// remaining high products (overwrite a3, a4)
	pmull2		v_a3.1q, v_a3.2d, v_a0.2d
	pmull2		v_a4.1q, v_a4.2d, v_a0.2d

	// y = data ^ low product
	eor		v_y5.16b, v_y5.16b, v_a5.16b
	eor		v_y6.16b, v_y6.16b, v_a6.16b
	eor		v_y7.16b, v_y7.16b, v_a7.16b
	eor		v_y8.16b, v_y8.16b, v_a8.16b

	// doubleword 6 loads issued ahead of the final vector XORs
	ldr		x_data_crc0, [x_buf_tmp, 512]
	ldr		x_data_crc1, [x_buf_tmp, 1024]
	ldr		x_data_crc2, [x_buf_tmp, 1536]

	// new accumulator = data ^ low product ^ high product
	eor		v_a1.16b, v_y5.16b, v_a1.16b
	eor		v_a2.16b, v_y6.16b, v_a2.16b
	eor		v_a3.16b, v_y7.16b, v_a3.16b
	eor		v_a4.16b, v_y8.16b, v_a4.16b

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// doubleword 7
	ldr		x_data_crc0, [x_buf_tmp, 520]
	ldr		x_data_crc1, [x_buf_tmp, 1032]
	ldr		x_data_crc2, [x_buf_tmp, 1544]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2
.endm
|
|
|
|
// crc32_mix2048: process exactly 2048 bytes starting at x_buf.
//
// In:   w_crc = running CRC, x_buf = start of the 2048-byte block
// Out:  w_ret_crc (the same register as w_crc) = updated CRC
//
// Work split, as established by the load offsets below:
//   bytes    0.. 511  folded with PMULL in v_a1..v_a4 (seeded with w_crc)
//   bytes  512..1023  crc32_u64 stream into w_crc0 (seeded with zero)
//   bytes 1024..1535  crc32_u64 stream into w_crc1 (seeded with zero)
//   bytes 1536..2047  crc32_u64 stream into w_crc2 (seeded with zero)
// The four partial results are merged at the end using the three
// constants stored at offset_crc32_const.
//
// When CRC32 is defined the seed is bit-inverted on the way in and the
// result is bit-inverted on the way out; otherwise both are used as is.
//
// Writes: x_buf_tmp, x_pconst, x_tmp, x_tmp1, x_data_crc0-2, w_crc0-2,
//         every SIMD alias declared in this file.  x_buf is not modified.
.macro	crc32_mix2048
	// stage the seed in lane 0 of an otherwise zero vector
	fmov		s_a1, w_crc
	movi		v_neon_tmp.4s, 0

	// page address of the constant table
#ifndef __APPLE__
	adrp		x_pconst, lanchor_crc32
#else
	adrp		x_pconst, lanchor_crc32@PAGE
#endif
	// pmull_fold reads its vector data at x_buf_tmp; the first
	// 64 bytes are consumed by the ld1 from x_buf further down
	add		x_buf_tmp, x_buf, 64

	// first doubleword of each scalar stream, CRC seeded with zero
	ldr		x_data_crc0, [x_buf, 512]
	ldr		x_data_crc1, [x_buf, 1024]
	ldr		x_data_crc2, [x_buf, 1536]

	crc32_u64	w_crc0, wzr, x_data_crc0
	crc32_u64	w_crc1, wzr, x_data_crc1
	crc32_u64	w_crc2, wzr, x_data_crc2

#ifdef CRC32
	mvn		v_a1.8b, v_a1.8b
#endif

	ins		v_neon_tmp.s[0], v_a1.s[0]

	// first 64 bytes of the vector stream
	ld1		{v_a1.4s, v_a2.4s, v_a3.4s, v_a4.4s}, [x_buf]

	ldr		x_data_crc0, [x_buf, 520]
	ldr		x_data_crc1, [x_buf, 1032]
	ldr		x_data_crc2, [x_buf, 1544]

	// mix the seed into the first 32 bits of the data
	eor		v_a1.16b, v_a1.16b, v_neon_tmp.16b
#ifndef __APPLE__
	ldr		q_a0, [x_pconst, #:lo12:lanchor_crc32]		// k1k2
#else
	ldr		q_a0, [x_pconst, #lanchor_crc32@PAGEOFF]	// k1k2
#endif

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// seven fully unrolled folding steps, x_buf_tmp advancing by 64
	// between consecutive steps
	.align 4
	pmull_fold
	.rept	6
	add		x_buf_tmp, x_buf_tmp, 64
	pmull_fold
	.endr

	// PMULL: fold the four accumulators into 128 bits
	// x_pconst becomes the full address of the constant table
#ifndef __APPLE__
	add		x_pconst, x_pconst, :lo12:lanchor_crc32
#else
	add		x_pconst, x_pconst, lanchor_crc32@PAGEOFF
#endif

	ldr		x_data_crc0, [x_buf, 976]
	ldr		x_data_crc1, [x_buf, 1488]
	ldr		x_data_crc2, [x_buf, 2000]

	ldr		q_a0, [x_pconst, offset_k3k4]	// k3k4

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// a1 = fold(a1) ^ a2
	pmull		v_a5.1q, v_a1.1d, v_a0.1d
	pmull2		v_a1.1q, v_a1.2d, v_a0.2d

	eor		v_a1.16b, v_a5.16b, v_a1.16b
	eor		v_a1.16b, v_a1.16b, v_a2.16b

	ldr		x_data_crc0, [x_buf, 984]
	ldr		x_data_crc1, [x_buf, 1496]
	ldr		x_data_crc2, [x_buf, 2008]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// a1 = fold(a1) ^ a3
	pmull		v_a5.1q, v_a1.1d, v_a0.1d
	pmull2		v_a1.1q, v_a1.2d, v_a0.2d

	ldr		x_data_crc0, [x_buf, 992]
	ldr		x_data_crc1, [x_buf, 1504]
	ldr		x_data_crc2, [x_buf, 2016]

	eor		v_a1.16b, v_a5.16b, v_a1.16b
	eor		v_a1.16b, v_a1.16b, v_a3.16b

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// a1 = fold(a1) ^ a4
	pmull		v_a5.1q, v_a1.1d, v_a0.1d
	pmull2		v_a1.1q, v_a1.2d, v_a0.2d

	ldr		x_data_crc0, [x_buf, 1000]
	ldr		x_data_crc1, [x_buf, 1512]
	ldr		x_data_crc2, [x_buf, 2024]

	eor		v_a1.16b, v_a5.16b, v_a1.16b
	eor		v_a1.16b, v_a1.16b, v_a4.16b

	// PMULL: fold 128 bits down to 64 bits
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// multiply the low half of a1 by the upper constant of k3k4
	dup		d_a0, v_a0.d[1]
	pmull		v_a2.1q, v_a1.1d, v_a0.1d

	movi		v_neon_zero.4s, 0
	ldr		q_k5k0, [x_pconst, offset_k5k0]	// k5k0
#ifndef __APPLE__
	adrp		x_tmp, .lanchor_mask
#else
	adrp		x_tmp, .lanchor_mask@PAGE
#endif

	ldr		x_data_crc0, [x_buf, 1008]
	ldr		x_data_crc1, [x_buf, 1520]
	ldr		x_data_crc2, [x_buf, 2032]

	// bring the high half of a1 down and combine with the product
	ext		v_a1.16b, v_a1.16b, v_neon_zero.16b, #8
	eor		v_a1.16b, v_a2.16b, v_a1.16b
#ifndef __APPLE__
	ldr		q_neon_tmp3, [x_tmp, #:lo12:.lanchor_mask]
#else
	ldr		q_neon_tmp3, [x_tmp, #.lanchor_mask@PAGEOFF]
#endif

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	dup		d_a0, v_k5k0.d[1]
	pmull		v_a3.1q, v_a2.1d, v_a0.1d

	// masked part of a1 times k5k0, XORed with a1 shifted by 4 bytes
	ext		v_a2.16b, v_a1.16b, v_neon_zero.16b, #4
	and		v_a1.16b, v_a1.16b, v_neon_tmp3.16b
	pmull		v_a1.1q, v_a1.1d, v_k5k0.1d
	eor		v_a1.16b, v_a2.16b, v_a1.16b

	// PMULL: Barrett reduction to 32 bits
	ldr		q_neon_tmp1, [x_pconst, offset_poly]	// poly

	// last doubleword of each scalar stream
	ldr		x_data_crc0, [x_buf, 1016]
	ldr		x_data_crc1, [x_buf, 1528]
	ldr		x_data_crc2, [x_buf, 2040]

	dup		d_neon_tmp2, v_neon_tmp1.d[1]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	and		v_a2.16b, v_a1.16b, v_neon_tmp3.16b
	pmull		v_a2.1q, v_a2.1d, v_neon_tmp2.1d
	and		v_a2.16b, v_neon_tmp3.16b, v_a2.16b
	pmull		v_a2.1q, v_a2.1d, v_neon_tmp1.1d

	// crc_pmull = 32-bit lane 1 of the reduced value
	eor		v_a1.16b, v_a1.16b, v_a2.16b
	dup		s_crc_pmull, v_a1.s[1]

	// merge crc_pmull, crc0, crc1 and crc2: the first three are each
	// multiplied by their own constant and passed through crc32_u64
	// with a zero seed, then everything is XORed together
	fmov		s_neon_crc0, w_crc0
	fmov		s_neon_crc1, w_crc1

	ldr		q_neon_const0, [x_pconst, offset_crc32_const]
	ldr		q_neon_const1, [x_pconst, offset_crc32_const+16]
	ldr		q_neon_const2, [x_pconst, offset_crc32_const+32]

	pmull		v_crc_pmull.1q, v_crc_pmull.1d, v_neon_const0.1d
	pmull		v_neon_crc0.1q, v_neon_crc0.1d, v_neon_const1.1d
	pmull		v_neon_crc1.1q, v_neon_crc1.1d, v_neon_const2.1d

	fmov		x_tmp1, d_neon_crc0
	crc32_u64	w_crc0, wzr, x_tmp1

	fmov		x_tmp1, d_neon_crc1
	crc32_u64	w_crc1, wzr, x_tmp1

	eor		w_ret_crc, w_crc1, w_crc0

	fmov		x_tmp1, d_crc_pmull
	crc32_u64	w_tmp, wzr, x_tmp1

	eor		w_crc2, w_tmp, w_crc2

	// final XOR; the CRC32 variant also inverts the result (eon)
#ifdef CRC32
	eon		w_ret_crc, w_crc2, w_ret_crc
#else
	eor		w_ret_crc, w_crc2, w_ret_crc
#endif
.endm
|
|
|
|
// crc32_mix_main_default: complete function body (it contains the ret
// instructions) computing a CRC over a buffer of arbitrary length.
//
// In:   crc (w0) = running CRC, buf (x1) = data, len (x2) = byte count
// Out:  ret_crc (w0)
//
// Flow:
//   len == 0      -> return crc unchanged
//   len >= 2048   -> run crc32_mix2048 over every whole 2048-byte block,
//                    then hand the remaining len % 2048 bytes (if any)
//                    to the scalar tail
//   otherwise     -> scalar tail only
// The scalar tail (.crc32_hw_handle) consumes 64-byte groups, then up
// to seven doublewords, then a word, a halfword and a byte as needed.
//
// NOTE(review): several branches sit between a cmp and a crc32_* macro
// invocation, so this code relies on those macros leaving the
// condition flags untouched - confirm against their definition.
.macro	crc32_mix_main_default
	cmp		x_len, 2047
	mov		x_len_saved, x_len
	mov		x_buf_saved, x_buf
	bls		.less_than_2048

	// x_buf_iter = buf + (len rounded down to a multiple of 2048),
	// i.e. the address just past the last whole block
	sub		x_buf_iter, x_len, #2048
	stp		x29, x30, [sp, -16]!

	mov		x29, sp
	and		x_buf_iter, x_buf_iter, -2048
	add		x_buf_iter, x_buf_iter, 2048
	add		x_buf_iter, x_buf, x_buf_iter

	.align 4
.loop_mix:
	// crc32_mix2048 reads x_buf and updates the CRC in w0
	mov		x_buf, x_buf_saved
	crc32_mix2048

	add		x_buf_saved, x_buf_saved, 2048
	cmp		x_buf_saved, x_buf_iter
	bne		.loop_mix

	// anything left after the whole blocks?
	and		x_len_saved, x_len_saved, 2047
	cbnz		x_len_saved, .remain_ldp

	ldp		x29, x30, [sp], 16
	ret

	// tail after the block loop: pop the frame, then set up
	// crc_tmp / size / buf for the scalar path
	.align 4
.remain_ldp:
	mov		w_crc_tmp, crc
	ldp		x29, x30, [sp], 16
	mov		size, x_len_saved
	mov		buf, x_buf_iter
	b		.crc32_hw_handle

	// short input (0 < len < 2048): no frame was pushed
.remain:
	mov		w_crc_tmp, crc
	mov		size, x_len_saved
	mov		buf, x_buf_saved
	b		.crc32_hw_handle

	.align 4
.less_than_2048:
	cbnz		x_len, .remain
	ret

	// ---- scalar tail: crc_tmp = CRC, buf = data, size = bytes ----
.crc32_hw_handle:
	cmp		size, 63

	// mvn does not alter the flags set by the cmp above
#ifdef CRC32
	mvn		crc_tmp, crc_tmp
#endif

	bls		.less_than_64
	// buf_saved = buf + (size rounded down to a multiple of 64)
	sub		buf_saved, size, #64
	and		buf_saved, buf_saved, -64
	add		buf_saved, buf_saved, 64
	add		buf_saved, buf, buf_saved

	// 64 bytes per iteration; buf is advanced in the middle of the
	// body, so the later loads use negative offsets
	.align 4
.loop_64:
	ldp		data_tmp1, data_tmp2, [buf]
	ldr		data_tmp3, [buf, 16]
	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

	ldp		data_tmp1, data_tmp2, [buf, 24]
	add		buf, buf, 64

	crc32_u64	crc_tmp, crc_tmp, data_tmp3
	ldr		data_tmp3, [buf, -24]

	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

	ldp		data_tmp1, data_tmp2, [buf, -16]
	cmp		buf_saved, buf
	crc32_u64	crc_tmp, crc_tmp, data_tmp3

	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bne		.loop_64

	and		size, size, 63

	// 0..63 bytes left: up to seven doublewords, leaving after the
	// last one that fits.  size_tmp keeps size - 8 for the pointer
	// update at .crc32_hw_w_pre; it shares x11 with data_tmp1, so
	// this path loads only into data_tmp2 and uses data_tmp3 as
	// compare scratch.
.less_than_64:
	cmp		size, 7
	bls		.crc32_hw_w

	ldr		data_tmp2, [buf]
	sub		size_tmp, size, #8
	cmp		size_tmp, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls		.crc32_hw_w_pre

	ldr		data_tmp2, [buf, 8]
	sub		data_tmp3, size, #16
	cmp		data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls		.crc32_hw_w_pre

	ldr		data_tmp2, [buf, 16]
	sub		data_tmp3, size, #24
	cmp		data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls		.crc32_hw_w_pre

	ldr		data_tmp2, [buf, 24]
	sub		data_tmp3, size, #32
	cmp		data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls		.crc32_hw_w_pre

	ldr		data_tmp2, [buf, 32]
	sub		data_tmp3, size, #40
	cmp		data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls		.crc32_hw_w_pre

	ldr		data_tmp2, [buf, 40]
	sub		data_tmp3, size, #48
	cmp		data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls		.crc32_hw_w_pre

	ldr		data_tmp2, [buf, 48]
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

	// buf += size rounded down to a multiple of 8, computed as
	// ((size - 8) & -8) + 8; size becomes size % 8
.crc32_hw_w_pre:
	and		size_tmp, size_tmp, -8
	and		size, size, 7
	add		size_tmp, size_tmp, 8
	add		buf, buf, size_tmp

	// 4..7 bytes left: one word
.crc32_hw_w:
	cmp		size, 3
	bls		.crc32_hw_h
	ldr		w_data_tmp2, [buf], 4
	sub		size, size, #4
	crc32_u32	crc_tmp, crc_tmp, w_data_tmp2

	// 2..3 bytes left: one halfword
.crc32_hw_h:
	cmp		size, 1
	bls		.crc32_hw_b
	ldrh		w_data_tmp2, [buf], 2
	sub		size, size, #2
	crc32_u16	crc_tmp, crc_tmp, w_data_tmp2

	// final byte, if any
.crc32_hw_b:
	cbz		size, .crc32_hw_done
	ldrb		w_data_tmp2, [buf]
	crc32_u8	crc_tmp, crc_tmp, w_data_tmp2

	// undo the entry inversion for the CRC32 variant
.crc32_hw_done:
#ifdef CRC32
	mvn		ret_crc, crc_tmp
#else
	mov		ret_crc, crc_tmp
#endif
	ret
.endm
|