mirror of
https://github.com/intel/isa-l.git
synced 2025-01-22 05:20:02 +01:00
4785428d2f
+ Utilise `pmull2` instruction in main loops of arm64 crc functions and avoid the need for `dup` to align multiplicands. + Use just 1 ASIMD register to hold both 64b p4 constants, appropriately aligned. + Interleave quadword `ldr` with `pmull{2}` to avoid unnecessary stalls on existing LITTLE uarch (which can only issue these instructions every other cycle). + Similarly interleave scalar instructions with ASIMD instructions to increase likelihood of instruction level parallelism on a variety of uarch. + Cut down on needless instructions in non-critical sections to help performance for small buffers. + Extract common instruction sequences into inner macros and moved them into shared header - crc_common_pmull.h + Use the same human readable register aliases and register allocation in all 4 implementations, never refer to registers without using human readable alias. + Use #defines rather than .req to allow use of same names across several implementations + Reduce tail case size from 1024B to 64B + Phrased the `eor` instructions in the main loop to more clearly show that we can rewrite pairs of `eor` instructions with a single `eor3` instruction in the presence of Armv8.2-SHA (should probably be an option in multibinary in future). Change-Id: I3688193ea4ad88b53cf47e5bd9a7fd5c2b4401e1 Signed-off-by: Samuel Lee <samuel.lee@microsoft.com>
302 lines
8.2 KiB
C
302 lines
8.2 KiB
C
########################################################################
|
|
# Copyright (c) 2019 Microsoft Corporation.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions
|
|
# are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in
|
|
# the documentation and/or other materials provided with the
|
|
# distribution.
|
|
# * Neither the name of Microsoft Corporation nor the names of its
|
|
# contributors may be used to endorse or promote products derived
|
|
# from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#########################################################################
|
|
|
|
// ---------------------------------------------------------------------
// Register aliases for the values passed in and handed back.
// Comments are kept on their own lines so that no comment text can end
// up inside a macro replacement list.
// ---------------------------------------------------------------------

// parameters: w0/x0 carry the seed, x1 the buffer, w2/x2 the length
#define w_seed		w0
#define x_seed		x0
#define x_buf		x1
#define w_len		w2
#define x_len		x2

// return: same register as the seed (w0/x0)
#define w_crc_ret	w0
#define x_crc_ret	x0

// constant: 64, which is also the number of bytes the load and loop
// macros in this file consume per step
#define FOLD_SIZE	64
|
|
|
|
// ---------------------------------------------------------------------
// global variables: general purpose working registers
// ---------------------------------------------------------------------

// loop bookkeeping
#define x_buf_end	x3
#define w_counter	w4
#define x_counter	x4
#define x_buf_iter	x5

// x6 has two names; a value kept under one name is lost as soon as the
// other name is written
#define x_crc_tab_addr	x6
#define x_tmp2		x6

// scratch (32-bit and 64-bit views of register 7)
#define w_tmp		w7
#define x_tmp		x7
|
|
|
|
// ---------------------------------------------------------------------
// Vector register aliases.
// Several names map onto the same physical register, so only one of the
// roles sharing a register can be live at any time.
// ---------------------------------------------------------------------

// x0..x3: the four 128-bit accumulators (v0..v3)
#define v_x0		v0
#define d_x0		d0
#define s_x0		s0

#define q_x1		q1
#define v_x1		v1

#define q_x2		q2
#define v_x2		v2

#define q_x3		q3
#define v_x3		v3
#define d_x3		d3
#define s_x3		s3

// y0..y3: the four 128-bit blocks loaded per loop iteration (v4..v7);
// v4 and v5 are also used as temporaries by the fold macros
#define q_y0		q4
#define v_y0		v4
#define v_tmp_high	v4
#define d_tmp_high	d4

#define q_y1		q5
#define v_y1		v5
#define v_tmp_low	v5

#define q_y2		q6
#define v_y2		v6

#define q_y3		q7
#define v_y3		v7

// v30: first-block staging register and constant pairs (p4, p1, p0,
// br_low); the _low names are the d view (bits 0..63), the _high names
// and d_br_low2 are lane d[1] (bits 64..127)
#define q_x0_tmp	q30
#define v_x0_tmp	v30
#define d_p4_high	v30.d[1]
#define d_p4_low	d30
#define v_p4		v30
#define d_p1_high	v30.d[1]
#define d_p1_low	d30
#define v_p1		v30
#define d_p0_high	v30.d[1]
#define d_p0_low	d30
#define v_p0		v30
#define d_br_low	d30
#define d_br_low2	v30.d[1]
#define v_br_low	v30

// v31: byte shuffle index vector, br_high, and a second p0 pair
#define q_shuffle	q31
#define v_shuffle	v31
#define d_br_high	d31
#define d_br_high2	v31.d[1]
#define v_br_high	v31
#define d_p0_low2	d31
#define d_p0_high2	v31.d[1]
#define v_p02		v31

// v16..v19: hold the pmull2 products for x0..x3 inside the main loops
#define v_x0_high	v16
#define v_x1_high	v17
#define v_x2_high	v18
#define v_x3_high	v19
|
|
|
|
// crc_refl_load_first_block
//
// Loads the first 64 bytes at x_buf and sets up the loop bookkeeping.
//
// Reads:   x_buf, x_len, v_x0
// Writes:  q_x0_tmp, q_x1, q_x2, q_x3, v_x0, x_counter, x_tmp,
//          x_buf_iter, condition flags
//
// On exit:
//   x_counter  = x_len with its low six bits cleared (a multiple of 64)
//   x_tmp      = x_counter - 64
//   flags      = result of comparing x_tmp with 63; no instruction after
//                the cmp in this macro writes the flags, so the code that
//                follows the expansion can branch on them
//   x_buf_iter = x_buf + 64
//   v_x0       = v_x0 on entry XOR bytes 0..15 of the buffer
//   v_x1..v_x3 = bytes 16..63 of the buffer
//
// NOTE(review): v_x0 presumably holds the CRC seed on entry - confirm
// against the code that expands this macro.
.macro crc_refl_load_first_block
	ldr	q_x0_tmp, [x_buf]
	ldr	q_x1, [x_buf, #16]
	ldr	q_x2, [x_buf, #32]
	ldr	q_x3, [x_buf, #48]

	// round the length down to whole 64-byte blocks, then discount the
	// block that was just loaded
	and	x_counter, x_len, #-64
	sub	x_tmp, x_counter, #64
	cmp	x_tmp, #63

	add	x_buf_iter, x_buf, #64

	// fold the incoming v_x0 into the first 16 bytes of data
	eor	v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm
|
|
|
|
// crc_norm_load_first_block
//
// Same job as crc_refl_load_first_block, with one extra step: each of the
// four loaded 16-byte blocks is permuted with tbl using the index vector
// read from .shuffle_data.
//
// Reads:   x_buf, x_len, v_x0, the 16 bytes at .shuffle_data
// Writes:  q_shuffle, q_x0_tmp, q_x1, q_x2, q_x3, v_x0, x_counter, x_tmp,
//          x_buf_iter, condition flags
//
// On exit:
//   v_shuffle  = index vector from .shuffle_data (left loaded for later)
//   x_counter  = x_len with its low six bits cleared (a multiple of 64)
//   x_tmp      = x_counter - 64
//   flags      = result of comparing x_tmp with 63; the add, tbl and eor
//                that follow do not write the flags
//   x_buf_iter = x_buf + 64
//   v_x0       = v_x0 on entry XOR permuted bytes 0..15 of the buffer
//   v_x1..v_x3 = permuted bytes 16..63 of the buffer
//
// NOTE(review): .shuffle_data is not defined in this view; presumably it
// is a byte-reversal table provided by the including file - confirm.
.macro crc_norm_load_first_block
	// x_tmp is only a page-address scratch here; it is overwritten by
	// the sub further down
	adrp	x_tmp, .shuffle_data
	ldr	q_shuffle, [x_tmp, #:lo12:.shuffle_data]

	ldr	q_x0_tmp, [x_buf]
	ldr	q_x1, [x_buf, #16]
	ldr	q_x2, [x_buf, #32]
	ldr	q_x3, [x_buf, #48]

	// round the length down to whole 64-byte blocks, then discount the
	// block that was just loaded
	and	x_counter, x_len, #-64
	sub	x_tmp, x_counter, #64
	cmp	x_tmp, #63

	add	x_buf_iter, x_buf, #64

	// permute the bytes of every loaded block
	tbl	v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
	tbl	v_x1.16b, {v_x1.16b}, v_shuffle.16b
	tbl	v_x2.16b, {v_x2.16b}, v_shuffle.16b
	tbl	v_x3.16b, {v_x3.16b}, v_shuffle.16b

	// fold the incoming v_x0 into the first 16 bytes of data
	eor	v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm
|
|
|
|
// crc32_load_p4
//
// Computes the loop end address and loads the two p4 constants into the
// two 64-bit halves of v_p4.
//
// Reads:   x_buf_iter, x_tmp
// Writes:  x_buf_end, x_tmp, x_tmp2, v_p4
//
// On exit:
//   x_buf_end   = x_buf_iter + x_tmp (x_tmp as it was on entry)
//   v_p4 low    = value built from p4_low_b0 and p4_low_b1
//   v_p4 high   = value built from p4_high_b0 and p4_high_b1
//
// x_tmp2 is x6, which is also named x_crc_tab_addr, so that register is
// clobbered as well. v_p4 is v30, the same register as v_x0_tmp.
//
// NOTE(review): the p4_*_b0 / p4_*_b1 symbols are not defined in this
// view; presumably they are the 16-bit pieces of each constant supplied
// by the including file - confirm.
.macro crc32_load_p4
	// must come first: x_tmp is reused as scratch right below
	add	x_buf_end, x_buf_iter, x_tmp

	// fmov to the d view clears bits 64..127 of the vector register, so
	// the low half has to be written before the high half
	mov	x_tmp, p4_low_b0
	movk	x_tmp, p4_low_b1, lsl #16
	fmov	d_p4_low, x_tmp

	// writing lane d[1] leaves the low half untouched
	mov	x_tmp2, p4_high_b0
	movk	x_tmp2, p4_high_b1, lsl #16
	fmov	d_p4_high, x_tmp2
.endm
|
|
|
|
// crc64_load_p4
//
// Computes the loop end address and loads the two p4 constants into the
// two 64-bit halves of v_p4. Each constant is assembled from four pieces
// with mov followed by three movk at shifts 16, 32 and 48.
//
// Reads:   x_buf_iter, x_tmp
// Writes:  x_buf_end, x_tmp, x_tmp2, v_p4
//
// On exit:
//   x_buf_end   = x_buf_iter + x_tmp (x_tmp as it was on entry)
//   v_p4 low    = value built from p4_low_b0 .. p4_low_b3
//   v_p4 high   = value built from p4_high_b0 .. p4_high_b3
//
// x_tmp2 is x6, which is also named x_crc_tab_addr, so that register is
// clobbered as well. v_p4 is v30, the same register as v_x0_tmp.
//
// NOTE(review): the p4_*_b0 .. p4_*_b3 symbols are not defined in this
// view; presumably they are the 16-bit pieces of each constant supplied
// by the including file - confirm.
.macro crc64_load_p4
	// must come first: x_tmp is reused as scratch right below
	add	x_buf_end, x_buf_iter, x_tmp

	// fmov to the d view clears bits 64..127 of the vector register, so
	// the low half has to be written before the high half
	mov	x_tmp, p4_low_b0
	movk	x_tmp, p4_low_b1, lsl #16
	movk	x_tmp, p4_low_b2, lsl #32
	movk	x_tmp, p4_low_b3, lsl #48
	fmov	d_p4_low, x_tmp

	// writing lane d[1] leaves the low half untouched
	mov	x_tmp2, p4_high_b0
	movk	x_tmp2, p4_high_b1, lsl #16
	movk	x_tmp2, p4_high_b2, lsl #32
	movk	x_tmp2, p4_high_b3, lsl #48
	fmov	d_p4_high, x_tmp2
.endm
|
|
|
|
// crc_refl_loop
//
// Main folding loop. Every pass consumes 64 bytes at x_buf_iter and
// updates the four accumulators:
//
//   xN = pmull2(xN, p4) XOR pmull(xN, p4) XOR yN        for N = 0..3
//
// where pmull2 multiplies the upper 64-bit lanes, pmull the lower 64-bit
// lanes, and yN is the N-th 16-byte block just loaded.
//
// Reads:   x_buf_iter, x_buf_end, v_p4, v_x0..v_x3
// Writes:  v_x0..v_x3, v_y0..v_y3, v_x0_high..v_x3_high, x_buf_iter,
//          condition flags
//
// The body always runs at least once and repeats until x_buf_iter equals
// x_buf_end. The exit test is for equality, so x_buf_end has to be
// x_buf_iter plus a non-zero multiple of 64 on entry.
//
// .clmul_loop is an ordinary label without a per-expansion suffix, so
// this macro (or crc_norm_loop, which defines the same label) can be
// expanded only once per assembled file.
.macro crc_refl_loop
	.align 3
.clmul_loop:
	// interleave ldr and pmull(2) for arch which can only issue quadword
	// load every other cycle (e.g. A55)
	ldr	q_y0, [x_buf_iter]
	pmull2	v_x0_high.1q, v_x0.2d, v_p4.2d
	ldr	q_y1, [x_buf_iter, #16]
	pmull2	v_x1_high.1q, v_x1.2d, v_p4.2d
	ldr	q_y2, [x_buf_iter, #32]
	pmull2	v_x2_high.1q, v_x2.2d, v_p4.2d
	ldr	q_y3, [x_buf_iter, #48]
	pmull2	v_x3_high.1q, v_x3.2d, v_p4.2d

	// these overwrite xN, which is safe only because the pmull2 above
	// has already read it; the scalar add/cmp are slotted in between
	pmull	v_x0.1q, v_x0.1d, v_p4.1d
	add	x_buf_iter, x_buf_iter, #64
	pmull	v_x1.1q, v_x1.1d, v_p4.1d
	cmp	x_buf_iter, x_buf_end
	pmull	v_x2.1q, v_x2.1d, v_p4.1d
	pmull	v_x3.1q, v_x3.1d, v_p4.1d

	// combine the low and high products
	eor	v_x0.16b, v_x0.16b, v_x0_high.16b
	eor	v_x1.16b, v_x1.16b, v_x1_high.16b
	eor	v_x2.16b, v_x2.16b, v_x2_high.16b
	eor	v_x3.16b, v_x3.16b, v_x3_high.16b

	// mix in the freshly loaded data
	eor	v_x0.16b, v_x0.16b, v_y0.16b
	eor	v_x1.16b, v_x1.16b, v_y1.16b
	eor	v_x2.16b, v_x2.16b, v_y2.16b
	eor	v_x3.16b, v_x3.16b, v_y3.16b

	// flags still come from the cmp above; the vector ops do not set them
	bne	.clmul_loop
.endm
|
|
|
|
// crc_norm_loop
//
// Main folding loop with a byte permutation of the loaded data. Every
// pass consumes 64 bytes at x_buf_iter and updates the four accumulators:
//
//   xN = pmull2(xN, p4) XOR pmull(xN, p4) XOR tbl(yN, shuffle)
//                                                       for N = 0..3
//
// where pmull2 multiplies the upper 64-bit lanes, pmull the lower 64-bit
// lanes, and yN is the N-th 16-byte block just loaded.
//
// Reads:   x_buf_iter, x_buf_end, v_p4, v_shuffle, v_x0..v_x3
// Writes:  v_x0..v_x3, v_y0..v_y3, v_x0_high..v_x3_high, x_buf_iter,
//          condition flags
//
// The body always runs at least once and repeats until x_buf_iter equals
// x_buf_end. The exit test is for equality, so x_buf_end has to be
// x_buf_iter plus a non-zero multiple of 64 on entry.
//
// v_shuffle must already hold the index vector; crc_norm_load_first_block
// loads it from .shuffle_data.
//
// .clmul_loop is an ordinary label without a per-expansion suffix, so
// this macro (or crc_refl_loop, which defines the same label) can be
// expanded only once per assembled file.
.macro crc_norm_loop
	.align 3
.clmul_loop:
	// interleave ldr and pmull(2) for arch which can only issue quadword
	// load every other cycle (e.g. A55)
	ldr	q_y0, [x_buf_iter]
	pmull2	v_x0_high.1q, v_x0.2d, v_p4.2d
	ldr	q_y1, [x_buf_iter, #16]
	pmull2	v_x1_high.1q, v_x1.2d, v_p4.2d
	ldr	q_y2, [x_buf_iter, #32]
	pmull2	v_x2_high.1q, v_x2.2d, v_p4.2d
	ldr	q_y3, [x_buf_iter, #48]
	pmull2	v_x3_high.1q, v_x3.2d, v_p4.2d

	// these overwrite xN, which is safe only because the pmull2 above
	// has already read it; the scalar add/cmp are slotted in between
	pmull	v_x0.1q, v_x0.1d, v_p4.1d
	add	x_buf_iter, x_buf_iter, #64
	pmull	v_x1.1q, v_x1.1d, v_p4.1d
	cmp	x_buf_iter, x_buf_end
	pmull	v_x2.1q, v_x2.1d, v_p4.1d
	pmull	v_x3.1q, v_x3.1d, v_p4.1d

	// permute the bytes of the freshly loaded blocks
	tbl	v_y0.16b, {v_y0.16b}, v_shuffle.16b
	tbl	v_y1.16b, {v_y1.16b}, v_shuffle.16b
	tbl	v_y2.16b, {v_y2.16b}, v_shuffle.16b
	tbl	v_y3.16b, {v_y3.16b}, v_shuffle.16b

	// combine the low and high products
	eor	v_x0.16b, v_x0.16b, v_x0_high.16b
	eor	v_x1.16b, v_x1.16b, v_x1_high.16b
	eor	v_x2.16b, v_x2.16b, v_x2_high.16b
	eor	v_x3.16b, v_x3.16b, v_x3_high.16b

	// mix in the permuted data
	eor	v_x0.16b, v_x0.16b, v_y0.16b
	eor	v_x1.16b, v_x1.16b, v_y1.16b
	eor	v_x2.16b, v_x2.16b, v_y2.16b
	eor	v_x3.16b, v_x3.16b, v_y3.16b

	// flags still come from the cmp above; the vector ops do not set them
	bne	.clmul_loop
.endm
|
|
|
|
// crc32_fold_512b_to_128b
//
// Collapses the four accumulators into v_x3 in three chained steps:
//
//   x1 ^= pmull2(x0, p1) ^ pmull(x0, p1)
//   x2 ^= pmull2(x1, p1) ^ pmull(x1, p1)
//   x3 ^= pmull2(x2, p1) ^ pmull(x2, p1)
//
// where pmull2 multiplies the upper 64-bit lanes and pmull the lower
// 64-bit lanes. Each step uses the result of the previous one, so the
// three groups cannot be reordered.
//
// Reads:   v_x0..v_x3
// Writes:  v_x1, v_x2, v_x3, v_p1, v_tmp_high, v_tmp_low, x_tmp, x_tmp2
//
// Register sharing to be aware of:
//   v_p1 is v30, so loading it replaces whatever v_p4 held
//   v_tmp_high / v_tmp_low are v4 / v5, the same registers as v_y0 / v_y1
//   x_tmp2 is x6, which is also named x_crc_tab_addr
//
// NOTE(review): the p1_*_b0 / p1_*_b1 symbols are not defined in this
// view; presumably they are the 16-bit pieces of each constant supplied
// by the including file - confirm.
.macro crc32_fold_512b_to_128b
	// fmov to the d view clears bits 64..127 of the vector register, so
	// the low half has to be written before the high half
	mov	x_tmp, p1_low_b0
	movk	x_tmp, p1_low_b1, lsl #16
	fmov	d_p1_low, x_tmp

	mov	x_tmp2, p1_high_b0
	movk	x_tmp2, p1_high_b1, lsl #16
	fmov	d_p1_high, x_tmp2

	// fold x0 into x1
	pmull2	v_tmp_high.1q, v_x0.2d, v_p1.2d
	pmull	v_tmp_low.1q, v_x0.1d, v_p1.1d
	eor	v_x1.16b, v_x1.16b, v_tmp_high.16b
	eor	v_x1.16b, v_x1.16b, v_tmp_low.16b

	// fold x1 into x2
	pmull2	v_tmp_high.1q, v_x1.2d, v_p1.2d
	pmull	v_tmp_low.1q, v_x1.1d, v_p1.1d
	eor	v_x2.16b, v_x2.16b, v_tmp_high.16b
	eor	v_x2.16b, v_x2.16b, v_tmp_low.16b

	// fold x2 into x3
	pmull2	v_tmp_high.1q, v_x2.2d, v_p1.2d
	pmull	v_tmp_low.1q, v_x2.1d, v_p1.1d
	eor	v_x3.16b, v_x3.16b, v_tmp_high.16b
	eor	v_x3.16b, v_x3.16b, v_tmp_low.16b
.endm
|
|
|
|
// crc64_fold_512b_to_128b
//
// Collapses the four accumulators into v_x3 in three chained steps:
//
//   x1 ^= pmull2(x0, p1) ^ pmull(x0, p1)
//   x2 ^= pmull2(x1, p1) ^ pmull(x1, p1)
//   x3 ^= pmull2(x2, p1) ^ pmull(x2, p1)
//
// where pmull2 multiplies the upper 64-bit lanes and pmull the lower
// 64-bit lanes. Each step uses the result of the previous one, so the
// three groups cannot be reordered. The fold sequence is the same as in
// crc32_fold_512b_to_128b; only the constant setup differs: each p1 half
// is assembled from four pieces with mov followed by three movk at
// shifts 16, 32 and 48.
//
// Reads:   v_x0..v_x3
// Writes:  v_x1, v_x2, v_x3, v_p1, v_tmp_high, v_tmp_low, x_tmp, x_tmp2
//
// Register sharing to be aware of:
//   v_p1 is v30, so loading it replaces whatever v_p4 held
//   v_tmp_high / v_tmp_low are v4 / v5, the same registers as v_y0 / v_y1
//   x_tmp2 is x6, which is also named x_crc_tab_addr
//
// NOTE(review): the p1_*_b0 .. p1_*_b3 symbols are not defined in this
// view; presumably they are the 16-bit pieces of each constant supplied
// by the including file - confirm.
.macro crc64_fold_512b_to_128b
	// fmov to the d view clears bits 64..127 of the vector register, so
	// the low half has to be written before the high half
	mov	x_tmp, p1_low_b0
	movk	x_tmp, p1_low_b1, lsl #16
	movk	x_tmp, p1_low_b2, lsl #32
	movk	x_tmp, p1_low_b3, lsl #48
	fmov	d_p1_low, x_tmp

	mov	x_tmp2, p1_high_b0
	movk	x_tmp2, p1_high_b1, lsl #16
	movk	x_tmp2, p1_high_b2, lsl #32
	movk	x_tmp2, p1_high_b3, lsl #48
	fmov	d_p1_high, x_tmp2

	// fold x0 into x1
	pmull2	v_tmp_high.1q, v_x0.2d, v_p1.2d
	pmull	v_tmp_low.1q, v_x0.1d, v_p1.1d
	eor	v_x1.16b, v_x1.16b, v_tmp_high.16b
	eor	v_x1.16b, v_x1.16b, v_tmp_low.16b

	// fold x1 into x2
	pmull2	v_tmp_high.1q, v_x1.2d, v_p1.2d
	pmull	v_tmp_low.1q, v_x1.1d, v_p1.1d
	eor	v_x2.16b, v_x2.16b, v_tmp_high.16b
	eor	v_x2.16b, v_x2.16b, v_tmp_low.16b

	// fold x2 into x3
	pmull2	v_tmp_high.1q, v_x2.2d, v_p1.2d
	pmull	v_tmp_low.1q, v_x2.1d, v_p1.1d
	eor	v_x3.16b, v_x3.16b, v_tmp_high.16b
	eor	v_x3.16b, v_x3.16b, v_tmp_low.16b
.endm