/**********************************************************************
  Copyright(c) 2020 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

#include "../include/aarch64_label.h"

.macro declare_generic_reg name:req, reg:req, default:req
	\name		.req	\default\reg
	w_\name		.req	w\reg
	x_\name		.req	x\reg
.endm

.macro declare_neon_reg name:req, reg:req, default:req
	\name		.req	\default\reg
	v_\name		.req	v\reg
	q_\name		.req	q\reg
	d_\name		.req	d\reg
	s_\name		.req	s\reg
.endm

/**********************************************************************
	variables
**********************************************************************/
	declare_generic_reg	crc,		0,w
	declare_generic_reg	buf,		1,x
	declare_generic_reg	len,		2,x
	declare_generic_reg	buf_saved,	3,x
	declare_generic_reg	buf_iter,	4,x
	declare_generic_reg	len_saved,	5,x
	declare_generic_reg	buf_tmp,	6,x

	declare_generic_reg	crc0,		7,x
	declare_generic_reg	crc1,		8,x
	declare_generic_reg	crc2,		9,x
	declare_generic_reg	pconst,		10,x
	declare_generic_reg	data_crc0,	11,x
	declare_generic_reg	data_crc1,	12,x
	declare_generic_reg	data_crc2,	13,x

	declare_generic_reg	size,		9,x
	declare_generic_reg	crc_tmp,	10,w
	declare_generic_reg	size_tmp,	11,x
	declare_generic_reg	data_tmp1,	11,x
	declare_generic_reg	data_tmp2,	12,x
	declare_generic_reg	data_tmp3,	13,x

	declare_generic_reg	tmp,		14,x
	declare_generic_reg	tmp1,		15,x

	// return
	declare_generic_reg	ret_crc,	0,w

/**********************************************************************
	simd variables
**********************************************************************/
	declare_neon_reg	a0,		0,v
	declare_neon_reg	a1,		1,v
	declare_neon_reg	a2,		2,v
	declare_neon_reg	a3,		3,v
	declare_neon_reg	a4,		4,v

	declare_neon_reg	a5,		16,v
	declare_neon_reg	a6,		17,v
	declare_neon_reg	a7,		18,v
	declare_neon_reg	a8,		19,v

	declare_neon_reg	y5,		20,v
	declare_neon_reg	y6,		21,v
	declare_neon_reg	y7,		22,v
	declare_neon_reg	y8,		23,v

	declare_neon_reg	neon_zero,	24,v
	declare_neon_reg	neon_tmp,	24,v

	declare_neon_reg	k5k0,		25,v
	declare_neon_reg	neon_tmp1,	26,v
	declare_neon_reg	neon_tmp2,	27,v
	declare_neon_reg	neon_tmp3,	28,v
	declare_neon_reg	crc_pmull,	29,v
	declare_neon_reg	neon_crc0,	30,v
	declare_neon_reg	neon_crc1,	31,v

	declare_neon_reg	neon_const0,	5,v
	declare_neon_reg	neon_const1,	6,v
	declare_neon_reg	neon_const2,	7,v

// constants
	.equ	offset_k3k4, 16
	.equ	offset_k5k0, 32
	.equ	offset_poly, 48
	.equ	offset_crc32_const, 64

// pmull fold
.macro pmull_fold
	ldr	x_data_crc0, [x_buf_tmp, 464]
	ldr	x_data_crc1, [x_buf_tmp, 976]
	ldr	x_data_crc2, [x_buf_tmp, 1488]

	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 472]
	ldr	x_data_crc1, [x_buf_tmp, 984]
	ldr	x_data_crc2, [x_buf_tmp, 1496]

	pmull	v_a6.1q, v_a2.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 480]
	ldr	x_data_crc1, [x_buf_tmp, 992]
	ldr	x_data_crc2, [x_buf_tmp, 1504]

	pmull	v_a7.1q, v_a3.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 488]
	ldr	x_data_crc1, [x_buf_tmp, 1000]
	ldr	x_data_crc2, [x_buf_tmp, 1512]

	pmull	v_a8.1q, v_a4.1d, v_a0.1d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 496]
	ldr	x_data_crc1, [x_buf_tmp, 1008]
	ldr	x_data_crc2, [x_buf_tmp, 1520]

	pmull2	v_a1.1q, v_a1.2d, v_a0.2d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ld1	{v_y5.4s, v_y6.4s, v_y7.4s, v_y8.4s}, [x_buf_tmp]

	ldr	x_data_crc0, [x_buf_tmp, 504]
	ldr	x_data_crc1, [x_buf_tmp, 1016]
	ldr	x_data_crc2, [x_buf_tmp, 1528]

	pmull2	v_a2.1q, v_a2.2d, v_a0.2d
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	pmull2	v_a3.1q, v_a3.2d, v_a0.2d
	pmull2	v_a4.1q, v_a4.2d, v_a0.2d

	eor	v_y5.16b, v_y5.16b, v_a5.16b
	eor	v_y6.16b, v_y6.16b, v_a6.16b
	eor	v_y7.16b, v_y7.16b, v_a7.16b
	eor	v_y8.16b, v_y8.16b, v_a8.16b

	ldr	x_data_crc0, [x_buf_tmp, 512]
	ldr	x_data_crc1, [x_buf_tmp, 1024]
	ldr	x_data_crc2, [x_buf_tmp, 1536]

	eor	v_a1.16b, v_y5.16b, v_a1.16b
	eor	v_a2.16b, v_y6.16b, v_a2.16b
	eor	v_a3.16b, v_y7.16b, v_a3.16b
	eor	v_a4.16b, v_y8.16b, v_a4.16b

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	ldr	x_data_crc0, [x_buf_tmp, 520]
	ldr	x_data_crc1, [x_buf_tmp, 1032]
	ldr	x_data_crc2, [x_buf_tmp, 1544]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2
.endm

// crc32 mix for 2048 byte input data
.macro crc32_mix2048
	fmov	s_a1, w_crc
	movi	v_neon_tmp.4s, 0

#ifndef __APPLE__
	adrp	x_pconst, lanchor_crc32
	add	x_buf_tmp, x_buf, 64
#else
	adrp	x_pconst, lanchor_crc32@PAGE
	add	x_buf_tmp, x_buf, 64
#endif

	ldr	x_data_crc0, [x_buf, 512]
	ldr	x_data_crc1, [x_buf, 1024]
	ldr	x_data_crc2, [x_buf, 1536]

	crc32_u64	w_crc0, wzr, x_data_crc0
	crc32_u64	w_crc1, wzr, x_data_crc1
	crc32_u64	w_crc2, wzr, x_data_crc2

#ifdef CRC32
	mvn	v_a1.8b, v_a1.8b
#endif
	ins	v_neon_tmp.s[0], v_a1.s[0]

	ld1	{v_a1.4s, v_a2.4s, v_a3.4s, v_a4.4s}, [x_buf]

	ldr	x_data_crc0, [x_buf, 520]
	ldr	x_data_crc1, [x_buf, 1032]
	ldr	x_data_crc2, [x_buf, 1544]

	eor	v_a1.16b, v_a1.16b, v_neon_tmp.16b

#ifndef __APPLE__
	ldr	q_a0, [x_pconst, #:lo12:lanchor_crc32] // k1k2
#else
	ldr	q_a0, [x_pconst, #lanchor_crc32@PAGEOFF] // k1k2
#endif

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	// loop start, unroll the loop
	.align 4
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold

	add	x_buf_tmp, x_buf_tmp, 64
	pmull_fold
	// loop end

	// PMULL: fold into 128-bits
#ifndef __APPLE__
	add	x_pconst, x_pconst, :lo12:lanchor_crc32
#else
	add	x_pconst, x_pconst, lanchor_crc32@PAGEOFF
#endif

	ldr	x_data_crc0, [x_buf, 976]
	ldr	x_data_crc1, [x_buf, 1488]
	ldr	x_data_crc2, [x_buf, 2000]

	ldr	q_a0, [x_pconst, offset_k3k4] // k3k4

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	pmull2	v_a1.1q, v_a1.2d, v_a0.2d
	eor	v_a1.16b, v_a5.16b, v_a1.16b
	eor	v_a1.16b, v_a1.16b, v_a2.16b

	ldr	x_data_crc0, [x_buf, 984]
	ldr	x_data_crc1, [x_buf, 1496]
	ldr	x_data_crc2, [x_buf, 2008]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	pmull2	v_a1.1q, v_a1.2d, v_a0.2d

	ldr	x_data_crc0, [x_buf, 992]
	ldr	x_data_crc1, [x_buf, 1504]
	ldr	x_data_crc2, [x_buf, 2016]

	eor	v_a1.16b, v_a5.16b, v_a1.16b
	eor	v_a1.16b, v_a1.16b, v_a3.16b

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	pmull	v_a5.1q, v_a1.1d, v_a0.1d
	pmull2	v_a1.1q, v_a1.2d, v_a0.2d

	ldr	x_data_crc0, [x_buf, 1000]
	ldr	x_data_crc1, [x_buf, 1512]
	ldr	x_data_crc2, [x_buf, 2024]

	eor	v_a1.16b, v_a5.16b, v_a1.16b
	eor	v_a1.16b, v_a1.16b, v_a4.16b

	// PMULL: fold 128-bits to 64-bits
	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	dup	d_a0, v_a0.d[1]
	pmull	v_a2.1q, v_a1.1d, v_a0.1d

	movi	v_neon_zero.4s, 0
	ldr	q_k5k0, [x_pconst, offset_k5k0] // k5k0

#ifndef __APPLE__
	adrp	x_tmp, .lanchor_mask
#else
	adrp	x_tmp, .lanchor_mask@PAGE
#endif

	ldr	x_data_crc0, [x_buf, 1008]
	ldr	x_data_crc1, [x_buf, 1520]
	ldr	x_data_crc2, [x_buf, 2032]

	ext	v_a1.16b, v_a1.16b, v_neon_zero.16b, #8
	eor	v_a1.16b, v_a2.16b, v_a1.16b

#ifndef __APPLE__
	ldr	q_neon_tmp3, [x_tmp, #:lo12:.lanchor_mask]
#else
	ldr	q_neon_tmp3, [x_tmp, #.lanchor_mask@PAGEOFF]
#endif

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	dup	d_a0, v_k5k0.d[1]
	pmull	v_a3.1q, v_a2.1d, v_a0.1d

	ext	v_a2.16b, v_a1.16b, v_neon_zero.16b, #4
	and	v_a1.16b, v_a1.16b, v_neon_tmp3.16b
	pmull	v_a1.1q, v_a1.1d, v_k5k0.1d
	eor	v_a1.16b, v_a2.16b, v_a1.16b

	// PMULL: Barrett reduce to 32-bits
	ldr	q_neon_tmp1, [x_pconst, offset_poly] // poly

	ldr	x_data_crc0, [x_buf, 1016]
	ldr	x_data_crc1, [x_buf, 1528]
	ldr	x_data_crc2, [x_buf, 2040]

	dup	d_neon_tmp2, v_neon_tmp1.d[1]

	crc32_u64	w_crc0, w_crc0, x_data_crc0
	crc32_u64	w_crc1, w_crc1, x_data_crc1
	crc32_u64	w_crc2, w_crc2, x_data_crc2

	and	v_a2.16b, v_a1.16b, v_neon_tmp3.16b
	pmull	v_a2.1q, v_a2.1d, v_neon_tmp2.1d
	and	v_a2.16b, v_neon_tmp3.16b, v_a2.16b
	pmull	v_a2.1q, v_a2.1d, v_neon_tmp1.1d

	// crc_pmull result
	eor	v_a1.16b, v_a1.16b, v_a2.16b
	dup	s_crc_pmull, v_a1.s[1]

	// merge crc_pmull, crc0, crc1, crc2 using pmull instruction
	fmov	s_neon_crc0, w_crc0
	fmov	s_neon_crc1, w_crc1

	ldr	q_neon_const0, [x_pconst, offset_crc32_const]
	ldr	q_neon_const1, [x_pconst, offset_crc32_const+16]
	ldr	q_neon_const2, [x_pconst, offset_crc32_const+32]

	pmull	v_crc_pmull.1q, v_crc_pmull.1d, v_neon_const0.1d
	pmull	v_neon_crc0.1q, v_neon_crc0.1d, v_neon_const1.1d
	pmull	v_neon_crc1.1q, v_neon_crc1.1d, v_neon_const2.1d

	fmov	x_tmp1, d_neon_crc0
	crc32_u64	w_crc0, wzr, x_tmp1

	fmov	x_tmp1, d_neon_crc1
	crc32_u64	w_crc1, wzr, x_tmp1

	eor	w_ret_crc, w_crc1, w_crc0

	fmov	x_tmp1, d_crc_pmull
	crc32_u64	w_tmp, wzr, x_tmp1

	eor	w_crc2, w_tmp, w_crc2

	// handle crc32/crc32c
#ifdef CRC32
	eon	w_ret_crc, w_crc2, w_ret_crc
#else
	eor	w_ret_crc, w_crc2, w_ret_crc
#endif
.endm

// crc32 mix main default
.macro crc32_mix_main_default
	cmp	x_len, 2047
	mov	x_len_saved, x_len
	mov	x_buf_saved, x_buf
	bls	.less_than_2048

	sub	x_buf_iter, x_len, #2048
	stp	x29, x30, [sp, -16]!
	mov	x29, sp

	and	x_buf_iter, x_buf_iter, -2048
	add	x_buf_iter, x_buf_iter, 2048
	add	x_buf_iter, x_buf, x_buf_iter

	.align 4
.loop_mix:
	mov	x_buf, x_buf_saved
	crc32_mix2048

	add	x_buf_saved, x_buf_saved, 2048
	cmp	x_buf_saved, x_buf_iter
	bne	.loop_mix

	and	x_len_saved, x_len_saved, 2047
	cbnz	x_len_saved, .remain_ldp

	ldp	x29, x30, [sp], 16
	ret

	.align 4
.remain_ldp:
	mov	w_crc_tmp, crc
	ldp	x29, x30, [sp], 16
	mov	size, x_len_saved
	mov	buf, x_buf_iter
	b	.crc32_hw_handle

.remain:
	mov	w_crc_tmp, crc
	mov	size, x_len_saved
	mov	buf, x_buf_saved
	b	.crc32_hw_handle

	.align 4
.less_than_2048:
	cbnz	x_len, .remain
	ret

.crc32_hw_handle:
	cmp	size, 63

#ifdef CRC32
	mvn	crc_tmp, crc_tmp
#endif
	bls	.less_than_64

	sub	buf_saved, size, #64
	and	buf_saved, buf_saved, -64
	add	buf_saved, buf_saved, 64
	add	buf_saved, buf, buf_saved

	.align 4
.loop_64:
	ldp	data_tmp1, data_tmp2, [buf]
	ldr	data_tmp3, [buf, 16]
	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

	ldp	data_tmp1, data_tmp2, [buf, 24]
	add	buf, buf, 64

	crc32_u64	crc_tmp, crc_tmp, data_tmp3
	ldr	data_tmp3, [buf, -24]

	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

	ldp	data_tmp1, data_tmp2, [buf, -16]
	cmp	buf_saved, buf
	crc32_u64	crc_tmp, crc_tmp, data_tmp3

	crc32_u64	crc_tmp, crc_tmp, data_tmp1
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bne	.loop_64

	and	size, size, 63

.less_than_64:
	cmp	size, 7
	bls	.crc32_hw_w

	ldr	data_tmp2, [buf]
	sub	size_tmp, size, #8
	cmp	size_tmp, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 8]
	sub	data_tmp3, size, #16
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 16]
	sub	data_tmp3, size, #24
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 24]
	sub	data_tmp3, size, #32
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 32]
	sub	data_tmp3, size, #40
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 40]
	sub	data_tmp3, size, #48
	cmp	data_tmp3, 7
	crc32_u64	crc_tmp, crc_tmp, data_tmp2
	bls	.crc32_hw_w_pre

	ldr	data_tmp2, [buf, 48]
	crc32_u64	crc_tmp, crc_tmp, data_tmp2

.crc32_hw_w_pre:
	and	size_tmp, size_tmp, -8
	and	size, size, 7
	add	size_tmp, size_tmp, 8
	add	buf, buf, size_tmp

.crc32_hw_w:
	cmp	size, 3
	bls	.crc32_hw_h
	ldr	w_data_tmp2, [buf], 4
	sub	size, size, #4
	crc32_u32	crc_tmp, crc_tmp, w_data_tmp2

.crc32_hw_h:
	cmp	size, 1
	bls	.crc32_hw_b
	ldrh	w_data_tmp2, [buf], 2
	sub	size, size, #2
	crc32_u16	crc_tmp, crc_tmp, w_data_tmp2

.crc32_hw_b:
	cbz	size, .crc32_hw_done
	ldrb	w_data_tmp2, [buf]
	crc32_u8	crc_tmp, crc_tmp, w_data_tmp2

.crc32_hw_done:
#ifdef CRC32
	mvn	ret_crc, crc_tmp
#else
	mov	ret_crc, crc_tmp
#endif
	ret
.endm
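
// ---------------------------------------------------------------------
// Usage sketch (illustration only, not part of the original source).
// This file defines macros but emits no code or data by itself. A wrapper
// source is expected to provide the crc32_u8/crc32_u16/crc32_u32/crc32_u64
// mnemonic macros (mapping to crc32b/crc32h/crc32w/crc32x or the crc32c
// forms), define CRC32 when building the bit-reflected crc32 variant, and
// supply the lanchor_crc32 constant table (k1k2, k3k4, k5k0, poly and the
// merge constants at offset_crc32_const) plus the .lanchor_mask literal.
// With those in place, a wrapper could expand the entry macro under a
// concrete symbol, e.g. (hypothetical names):
//
//	.text
//	.align	3
//	.global	cdecl(crc32_mix_example)
//	cdecl(crc32_mix_example):
//		crc32_mix_main_default
// ---------------------------------------------------------------------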