From a2fc2c000d2dd6872b330554506eafb20bb99561 Mon Sep 17 00:00:00 2001 From: Jerry Yu Date: Mon, 2 Mar 2020 13:34:44 +0800 Subject: [PATCH] crc32:Add optimization implementation for Neoverse N1 This patch is based on the algorithm in reference (1), with some changes. - Redefine the block number to two. - This is because only two pipelines can be used for the CRC32 calculation. - Redefine the block size: - The block size of CRC is 1536B and PMULL is 512B - Interleave CRC and PMULL instructions. The optimization parameters are calculated based on reference (2). References: - (1) https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf - (2) https://developer.arm.com/docs/swog309707/a Change-Id: I1c9e593d59b521f56e4b3c807b396c083c181636 Signed-off-by: Jerry Yu --- crc/aarch64/Makefile.am | 6 +- crc/aarch64/crc32_common_mix_neoverse_n1.S | 434 +++++++++++++++++++++ crc/aarch64/crc32_mix_neoverse_n1.S | 66 ++++ crc/aarch64/crc32c_mix_neoverse_n1.S | 64 +++ crc/aarch64/crc_aarch64_dispatcher.c | 14 +- 5 files changed, 581 insertions(+), 3 deletions(-) create mode 100644 crc/aarch64/crc32_common_mix_neoverse_n1.S create mode 100644 crc/aarch64/crc32_mix_neoverse_n1.S create mode 100644 crc/aarch64/crc32c_mix_neoverse_n1.S diff --git a/crc/aarch64/Makefile.am b/crc/aarch64/Makefile.am index 57061f0..9fbb019 100644 --- a/crc/aarch64/Makefile.am +++ b/crc/aarch64/Makefile.am @@ -1,5 +1,5 @@ ######################################################################## -# Copyright(c) 2019 Arm Corporation All rights reserved. +# Copyright(c) 2020 Arm Corporation All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -44,4 +44,6 @@ lsrc_aarch64 += \ crc/aarch64/crc64_iso_refl_pmull.S \ crc/aarch64/crc64_iso_norm_pmull.S \ crc/aarch64/crc64_jones_refl_pmull.S \ - crc/aarch64/crc64_jones_norm_pmull.S + crc/aarch64/crc64_jones_norm_pmull.S \ + crc/aarch64/crc32_mix_neoverse_n1.S \ + crc/aarch64/crc32c_mix_neoverse_n1.S diff --git a/crc/aarch64/crc32_common_mix_neoverse_n1.S b/crc/aarch64/crc32_common_mix_neoverse_n1.S new file mode 100644 index 0000000..91b331d --- /dev/null +++ b/crc/aarch64/crc32_common_mix_neoverse_n1.S @@ -0,0 +1,434 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/* declare_var_vector_reg name,reg: define name_q, name_v, name_s and name_d as aliases of SIMD register number reg. */
+.macro declare_var_vector_reg name:req,reg:req
+    \name\()_q .req q\reg
+    \name\()_v .req v\reg
+    \name\()_s .req s\reg
+    \name\()_d .req d\reg
+.endm
+    declare_var_vector_reg k1k2,20          /* constants, loaded from .Lconstants */
+    declare_var_vector_reg k3k4,21
+    declare_var_vector_reg poly,22
+    declare_var_vector_reg k5k0,23
+    declare_var_vector_reg mask,24
+    declare_var_vector_reg fold_poly,25
+
+    declare_var_vector_reg tmp0,0           /* tmp0-tmp3: PMULL accumulators, tmp4-tmp7: scratch */
+    declare_var_vector_reg tmp1,1
+    declare_var_vector_reg tmp2,2
+    declare_var_vector_reg tmp3,3
+    declare_var_vector_reg tmp4,4
+    declare_var_vector_reg tmp5,5
+    declare_var_vector_reg tmp6,6
+    declare_var_vector_reg tmp7,7
+    declare_var_vector_reg pmull_data0,16   /* next 64 bytes of the PMULL block */
+    declare_var_vector_reg pmull_data1,17
+    declare_var_vector_reg pmull_data2,18
+    declare_var_vector_reg pmull_data3,19
+
+    vzr .req v26                            /* all-zero vector (set by movi) */
+
+    BUF .req x0
+    LEN .req x1
+    CRC .req x2
+    wCRC .req w2
+    const_addr .req x3
+    crc_blk_ptr .req x4
+    pmull_blk_ptr .req x5
+    crc_data0 .req x6
+    crc_data1 .req x7
+    crc_data2 .req x9                       /* x9-x11 are caller-saved temporaries (AAPCS64). The macro never */
+    crc_data3 .req x10                      /* saves or restores registers, so the callee-saved x19-x21 must  */
+    wPmull .req w11                         /* not be used here.                                              */
+
+    data0 .req x4                           /* tail-loop aliases; they reuse x4-x7 once the block loop is done */
+    data1 .req x5
+    data2 .req x6
+    data3 .req x7
+    wdata .req w4
+/* pmull_fold: one folding step of the interleaved 2048-byte block loop (no arguments). */
+.macro pmull_fold
+/* Multiplies tmp0-tmp3 by k1k2, XORs in the next 64 bytes at pmull_blk_ptr, and runs 12 crc32_u64 steps (96 bytes) from crc_blk_ptr. */
+    pmull2 tmp4_v.1q, tmp0_v.2d, k1k2_v.2d
+    pmull2 tmp5_v.1q, tmp1_v.2d, k1k2_v.2d
+    pmull2 tmp6_v.1q, tmp2_v.2d, k1k2_v.2d
+    pmull2 tmp7_v.1q, tmp3_v.2d, k1k2_v.2d
+
+    pmull tmp0_v.1q, tmp0_v.1d, k1k2_v.1d
+    pmull tmp1_v.1q, tmp1_v.1d, k1k2_v.1d
+    pmull tmp2_v.1q, tmp2_v.1d, k1k2_v.1d
+    pmull tmp3_v.1q, tmp3_v.1d, k1k2_v.1d
+    ld1 {pmull_data0_v.16b-pmull_data3_v.16b},[pmull_blk_ptr],#64
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+    eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+    eor tmp1_v.16b, tmp1_v.16b, tmp5_v.16b
+    eor tmp2_v.16b, tmp2_v.16b, tmp6_v.16b
+    eor tmp3_v.16b, tmp3_v.16b, tmp7_v.16b
+
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, v16.16b     /* v16-v19 are pmull_data0-pmull_data3 */
+    eor tmp1_v.16b, tmp1_v.16b, v17.16b
+    eor tmp2_v.16b, tmp2_v.16b, v18.16b
+    eor tmp3_v.16b, tmp3_v.16b, v19.16b
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+.endm
+/* crc32_common_mix poly_type: whole function body. In: x0 = BUF, x1 = LEN, w2 = CRC. Out: w0. poly_type crc32 inverts the CRC on entry and on return. */
+/* Per 2048-byte block: crc32 instructions over bytes 0..1535 are interleaved with PMULL folding of bytes 1536..2047, then both results are merged. */
+/* Bytes left after the last full block are consumed in 64/16/8/4/2/1-byte steps. Requires .Lconstants and the crc32_u64/u32/u16/u8 macros. */
+.macro crc32_common_mix poly_type
+    .set MIX_BLK_SIZE,2048
+    add pmull_blk_ptr,BUF,MIX_BLK_SIZE-512  /* overwritten by the mov below before it is used */
+.ifc \poly_type,crc32
+    mvn wCRC,wCRC
+.endif
+    cmp LEN,MIX_BLK_SIZE-1
+    mov pmull_blk_ptr,BUF
+    bls start_final                         /* fewer than 2048 bytes: tail loops only */
+    adr const_addr, .Lconstants
+    ld1 {k1k2_v.16b,k3k4_v.16b,poly_v.16b},[const_addr],#48
+    mov crc_blk_ptr,BUF
+    movi vzr.16b, #0
+    ld1 {k5k0_v.8b,mask_v.8b,fold_poly_v.8b},[const_addr]
+
+loop_2048:
+    mov crc_blk_ptr,BUF
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    add pmull_blk_ptr,pmull_blk_ptr,MIX_BLK_SIZE-512    /* PMULL part starts 1536 bytes into the block */
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    ld1 {tmp0_v.16b-tmp3_v.16b}, [pmull_blk_ptr], #0x40
+    sub LEN,LEN,MIX_BLK_SIZE
+    cmp LEN,MIX_BLK_SIZE                    /* flags are consumed by "bge loop_2048" below; nothing in between sets flags */
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+    pmull_fold
+    pmull_fold
+    pmull_fold
+    pmull_fold
+    pmull_fold
+    pmull_fold
+    pmull_fold
+
+    /* Folding cache line into 128bit */
+    pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp2_v.16b
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp3_v.16b
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+
+
+    /**
+     * perform the last 64 bit fold, also
+     * adds 32 zeroes to the input stream
+     */
+    ext tmp1_v.16b, tmp0_v.16b, tmp0_v.16b, #8
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    pmull2 tmp1_v.1q, tmp1_v.2d, k3k4_v.2d
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    ext tmp0_v.16b, tmp0_v.16b, vzr.16b, #8
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+    /* final 32-bit fold */
+    ext tmp1_v.16b, tmp0_v.16b, vzr.16b, #4
+    and tmp0_v.16b, tmp0_v.16b, mask_v.16b
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    pmull tmp0_v.1q, tmp0_v.1d, k5k0_v.1d
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+
+    /**
+     * Finish up with the bit-reversed barrett
+     * reduction 64 ==> 32 bits
+     */
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    and tmp1_v.16b, tmp0_v.16b, mask_v.16b
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    ext tmp1_v.16b, vzr.16b, tmp1_v.16b, #8
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    pmull2 tmp1_v.1q, tmp1_v.2d, poly_v.2d
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    and tmp1_v.16b, tmp1_v.16b, mask_v.16b
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    pmull tmp1_v.1q, tmp1_v.1d, poly_v.1d
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    mov wPmull, tmp0_v.s[1]                 /* 32-bit result of the PMULL part */
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    ldp crc_data0,crc_data1,[crc_blk_ptr],16
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+    ldp crc_data2,crc_data3,[crc_blk_ptr],16
+
+    crc32_u64 wCRC,wCRC,crc_data0
+    crc32_u64 wCRC,wCRC,crc_data1
+    crc32_u64 wCRC,wCRC,crc_data2
+    crc32_u64 wCRC,wCRC,crc_data3
+
+    fmov d0, CRC                            /* merge: multiply the CRC of the first 1536 bytes by fold_poly ... */
+    mov w6, 0                               /* x6 (crc_data0) is free here and is reloaded at loop_2048 */
+    pmull v0.1q, v0.1d, fold_poly_v.1d
+    fmov CRC, d0
+    add BUF,BUF,MIX_BLK_SIZE
+    crc32_u64 w6, w6, CRC                   /* ... reduce the product with a zero seed ... */
+    eor wCRC, w6, wPmull                    /* ... and XOR in the PMULL result */
+    bge loop_2048
+start_final:
+    cmp LEN, 63
+    bls .loop_16B
+.loop_64B:
+    ldp data0, data1, [BUF],#16
+    sub LEN,LEN,#64
+    ldp data2, data3, [BUF],#16
+    cmp LEN,#64                             /* flags are consumed by "bge .loop_64B" */
+    crc32_u64 wCRC, wCRC, data0
+    crc32_u64 wCRC, wCRC, data1
+    ldp data0, data1, [BUF],#16
+    crc32_u64 wCRC, wCRC, data2
+    crc32_u64 wCRC, wCRC, data3
+    ldp data2, data3, [BUF],#16
+    crc32_u64 wCRC, wCRC, data0
+    crc32_u64 wCRC, wCRC, data1
+    crc32_u64 wCRC, wCRC, data2
+    crc32_u64 wCRC, wCRC, data3
+    bge .loop_64B
+
+.loop_16B:
+    cmp LEN, 15
+    bls .less_16B
+    ldp data0, data1, [BUF],#16
+    sub LEN,LEN,#16
+    cmp LEN,15
+    crc32_u64 wCRC, wCRC, data0
+    crc32_u64 wCRC, wCRC, data1
+    bls .less_16B
+    ldp data0, data1, [BUF],#16
+    sub LEN,LEN,#16
+    cmp LEN,15
+    crc32_u64 wCRC, wCRC, data0
+    crc32_u64 wCRC, wCRC, data1
+    bls .less_16B
+    ldp data0, data1, [BUF],#16
+    sub LEN,LEN,#16                         /* LEN must now be less than 16 */
+    crc32_u64 wCRC, wCRC, data0
+    crc32_u64 wCRC, wCRC, data1
+.less_16B:
+    cmp LEN, 7
+    bls .less_8B
+    ldr data0, [BUF], 8
+    sub LEN, LEN, #8
+    crc32_u64 wCRC, wCRC, data0
+.less_8B:
+    cmp LEN, 3
+    bls .less_4B
+    ldr wdata, [BUF], 4
+    sub LEN, LEN, #4
+    crc32_u32 wCRC, wCRC, wdata
+.less_4B:
+    cmp LEN, 1
+    bls .less_2B
+    ldrh wdata, [BUF], 2
+    sub LEN, LEN, #2
+    crc32_u16 wCRC, wCRC, wdata
+.less_2B:
+    cbz LEN, .finish_exit
+    ldrb wdata, [BUF]
+    crc32_u8 wCRC, wCRC, wdata
+.finish_exit:
+.ifc \poly_type,crc32
+    mvn w0, wCRC
+.else
+    mov w0, wCRC
+.endif
+    ret
+.endm
+
diff --git a/crc/aarch64/crc32_mix_neoverse_n1.S b/crc/aarch64/crc32_mix_neoverse_n1.S
new file mode 100644
index 0000000..fa29770
--- /dev/null
+++ b/crc/aarch64/crc32_mix_neoverse_n1.S
@@ -0,0 +1,66 @@
+/**********************************************************************
+  Copyright(c) 2020 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+    .text
+    .align 6
+    .arch armv8-a+crypto+crc
+
+#include "crc32_common_mix_neoverse_n1.S"
+.Lconstants:
+    .octa 0x00000001c6e415960000000154442bd4    /* loaded into k1k2 by crc32_common_mix */
+    .octa 0x00000000ccaa009e00000001751997d0    /* loaded into k3k4 */
+    .octa 0x00000001F701164100000001DB710641    /* loaded into poly */
+    .quad 0x0000000163cd6124                    /* loaded into k5k0 */
+    .quad 0x00000000FFFFFFFF                    /* loaded into mask */
+    .quad 0x000000000c30f51d                    /* loaded into fold_poly */
+.macro crc32_u64 dst,src,data
+    crc32x \dst,\src,\data                      /* 64-bit data step */
+.endm
+.macro crc32_u32 dst,src,data
+    crc32w \dst,\src,\data                      /* 32-bit data step */
+.endm
+.macro crc32_u16 dst,src,data
+    crc32h \dst,\src,\data                      /* 16-bit data step */
+.endm
+.macro crc32_u8 dst,src,data
+    crc32b \dst,\src,\data                      /* 8-bit data step */
+.endm
+/* Entry point: expands crc32_common_mix with poly_type crc32, which inverts CRC on entry and again on return. */
+/* NOTE(review): the macro reads BUF from x0, LEN from x1 and CRC from w2 - confirm this matches the crc32_gzip_refl interface it is dispatched for. */
+/**
+ * uint32_t crc32_mix_neoverse_n1(uint32_t * BUF,
+ *                            size_t LEN, uint CRC)
+ */
+    .align 6
+    .global crc32_mix_neoverse_n1
+    .type crc32_mix_neoverse_n1, %function
+crc32_mix_neoverse_n1:
+    crc32_common_mix crc32
+    .size crc32_mix_neoverse_n1, .-crc32_mix_neoverse_n1
+
diff --git a/crc/aarch64/crc32c_mix_neoverse_n1.S b/crc/aarch64/crc32c_mix_neoverse_n1.S
new file mode 100644
index 0000000..6982b39
--- /dev/null
+++ b/crc/aarch64/crc32c_mix_neoverse_n1.S
@@ -0,0 +1,64 @@
+/**********************************************************************
+  Copyright(c) 2020 Arm Corporation All rights reserved. 
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+    .text
+    .align 6
+    .arch armv8-a+crypto+crc
+
+#include "crc32_common_mix_neoverse_n1.S"
+.Lconstants:
+    .octa 0x000000009e4addf800000000740eef02    /* loaded into k1k2 by crc32_common_mix */
+    .octa 0x000000014cd00bd600000000f20c0dfe    /* loaded into k3k4 */
+    .octa 0x00000000dea713f10000000105ec76f0    /* loaded into poly */
+    .quad 0x00000000dd45aab8                    /* loaded into k5k0 */
+    .quad 0x00000000FFFFFFFF                    /* loaded into mask */
+    .quad 0x00000000dd7e3b0c                    /* loaded into fold_poly */
+/* crc32_u64/u32/u16/u8: data-step wrappers used by crc32_common_mix, mapped here to the crc32c* instructions. */
+.macro crc32_u64 dst,src,data
+    crc32cx \dst,\src,\data                     /* 64-bit data step */
+.endm
+.macro crc32_u32 dst,src,data
+    crc32cw \dst,\src,\data                     /* 32-bit data step */
+.endm
+.macro crc32_u16 dst,src,data
+    crc32ch \dst,\src,\data                     /* 16-bit data step */
+.endm
+.macro crc32_u8 dst,src,data
+    crc32cb \dst,\src,\data                     /* 8-bit data step */
+.endm
+/**
+ * uint32_t crc32c_mix_neoverse_n1(uint32_t * BUF,
+ *                            size_t LEN, uint CRC)
+ * Expands crc32_common_mix with poly_type crc32c: CRC is used and returned without inversion.
+ * NOTE(review): the macro compares LEN as a full 64-bit x1 value - confirm the crc32_iscsi
+ * interface this is dispatched for passes a 64-bit (or zero-extended) length.
+ */
+    .align 6
+    .global crc32c_mix_neoverse_n1
+    .type crc32c_mix_neoverse_n1, %function
+crc32c_mix_neoverse_n1:
+    crc32_common_mix crc32c
+    .size crc32c_mix_neoverse_n1, .-crc32c_mix_neoverse_n1
diff --git a/crc/aarch64/crc_aarch64_dispatcher.c b/crc/aarch64/crc_aarch64_dispatcher.c
index bac9eeb..2df0f28 100644
--- a/crc/aarch64/crc_aarch64_dispatcher.c
+++ b/crc/aarch64/crc_aarch64_dispatcher.c
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2019 Arm Corporation All rights reserved.
+  Copyright(c) 2019-2020 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -62,6 +62,12 @@ DEFINE_INTERFACE_DISPATCHER(crc32_ieee) DEFINE_INTERFACE_DISPATCHER(crc32_iscsi) { unsigned long auxval = getauxval(AT_HWCAP); + if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(crc32c_mix_neoverse_n1); + } + } if (auxval & HWCAP_CRC32) return PROVIDER_INFO(crc32_iscsi_refl_hw_fold); if (auxval & HWCAP_PMULL) { @@ -74,6 +80,12 @@ DEFINE_INTERFACE_DISPATCHER(crc32_iscsi) DEFINE_INTERFACE_DISPATCHER(crc32_gzip_refl) { unsigned long auxval = getauxval(AT_HWCAP); + if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(crc32_mix_neoverse_n1); + } + } if (auxval & HWCAP_CRC32) return PROVIDER_INFO(crc32_gzip_refl_hw_fold); if (auxval & HWCAP_PMULL)