isa-l/crc/aarch64/crc_common_pmull.h
Samuel Lee 4785428d2f crc: arm64 implementation tweaks
+ Utilise the `pmull2` instruction in the main loops of the arm64 crc
functions to avoid the need for `dup` to align multiplicands (see the
sketch after this list).
+ Use just 1 ASIMD register to hold both 64b p4 constants,
appropriately aligned.
+ Interleave quadword `ldr` with `pmull{2}` to avoid unnecessary stalls
on existing LITTLE uarchs (which can only issue these instructions every
other cycle).
+ Similarly, interleave scalar instructions with ASIMD instructions to
increase the likelihood of instruction-level parallelism on a variety of
uarchs.
+ Cut down on needless instructions in non-critical sections to help
performance for small buffers.
+ Extract common instruction sequences into inner macros and move them
into a shared header, crc_common_pmull.h.
+ Use the same human-readable register aliases and register allocation
in all 4 implementations; never refer to a register without its
human-readable alias.
+ Use #defines rather than .req to allow the same names to be used
across several implementations.
+ Reduce the tail case size from 1024B to 64B.
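
A minimal sketch of the first two points (v_x, v_lo and v_hi are
illustrative names, not aliases from the header): with p4_low packed
into lane 0 and p4_high into lane 1 of a single register, `pmull` and
`pmull2` select the correct 64b multiplicand directly, so no `dup` is
needed:

        pmull   v_lo.1q, v_x.1d, v_p4.1d    // x.d[0] * p4_low
        pmull2  v_hi.1q, v_x.2d, v_p4.2d    // x.d[1] * p4_high
        eor     v_x.16b, v_lo.16b, v_hi.16b // combine the partial folds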

+ Phrase the `eor` instructions in the main loop to show more clearly
that pairs of `eor` instructions can be rewritten as a single `eor3`
instruction in the presence of Armv8.2-SHA (this should probably become
an option in multibinary in future); a sketch follows.
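
A minimal sketch of that rewrite, assuming a core with `eor3`
(FEAT_SHA3); the aliases are those used in crc_refl_loop below:

        // today: two dependent 2-way xors per accumulator
        eor     v_x0.16b, v_x0.16b, v_x0_high.16b
        eor     v_x0.16b, v_x0.16b, v_y0.16b
        // with eor3: a single 3-way xor
        eor3    v_x0.16b, v_x0.16b, v_x0_high.16b, v_y0.16b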

Change-Id: I3688193ea4ad88b53cf47e5bd9a7fd5c2b4401e1
Signed-off-by: Samuel Lee <samuel.lee@microsoft.com>
2019-11-13 10:58:19 -07:00

########################################################################
# Copyright (c) 2019 Microsoft Corporation.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Microsoft Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################
// parameters
#define w_seed          w0
#define x_seed          x0
#define x_buf           x1
#define w_len           w2
#define x_len           x2
// return
#define w_crc_ret       w0
#define x_crc_ret       x0
// constant
#define FOLD_SIZE       64
// global variables
#define x_buf_end       x3
#define w_counter       w4
#define x_counter       x4
#define x_buf_iter      x5
#define x_crc_tab_addr  x6
#define x_tmp2          x6
#define w_tmp           w7
#define x_tmp           x7
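// ASIMD registers: x0..x3 accumulate the 512b folding state, y0..y3 stage
// incoming data; v30/v31 hold constants and masks under several aliases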
#define v_x0            v0
#define d_x0            d0
#define s_x0            s0
#define q_x1            q1
#define v_x1            v1
#define q_x2            q2
#define v_x2            v2
#define q_x3            q3
#define v_x3            v3
#define d_x3            d3
#define s_x3            s3
#define q_y0            q4
#define v_y0            v4
#define v_tmp_high      v4
#define d_tmp_high      d4
#define q_y1            q5
#define v_y1            v5
#define v_tmp_low       v5
#define q_y2            q6
#define v_y2            v6
#define q_y3            q7
#define v_y3            v7
#define q_x0_tmp        q30
#define v_x0_tmp        v30
#define d_p4_high       v30.d[1]
#define d_p4_low        d30
#define v_p4            v30
#define d_p1_high       v30.d[1]
#define d_p1_low        d30
#define v_p1            v30
#define d_p0_high       v30.d[1]
#define d_p0_low        d30
#define v_p0            v30
#define d_br_low        d30
#define d_br_low2       v30.d[1]
#define v_br_low        v30
#define q_shuffle       q31
#define v_shuffle       v31
#define d_br_high       d31
#define d_br_high2      v31.d[1]
#define v_br_high       v31
#define d_p0_low2       d31
#define d_p0_high2      v31.d[1]
#define v_p02           v31
#define v_x0_high       v16
#define v_x1_high       v17
#define v_x2_high       v18
#define v_x3_high       v19
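// load the first 64B of the buffer and fold the seed (placed in v_x0 by
// the caller) into the first 16B; x_counter holds len rounded down to 64B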
.macro crc_refl_load_first_block
        ldr     q_x0_tmp, [x_buf]
        ldr     q_x1, [x_buf, 16]
        ldr     q_x2, [x_buf, 32]
        ldr     q_x3, [x_buf, 48]
        and     x_counter, x_len, -64
        sub     x_tmp, x_counter, #64
        cmp     x_tmp, 63
        add     x_buf_iter, x_buf, 64
        eor     v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm
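// as crc_refl_load_first_block, but each 16B block is byte-reversed with
// the .shuffle_data table so the normal (non-reflected) bit order folds
// correctly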
.macro crc_norm_load_first_block
        adrp    x_tmp, .shuffle_data
        ldr     q_shuffle, [x_tmp, #:lo12:.shuffle_data]
        ldr     q_x0_tmp, [x_buf]
        ldr     q_x1, [x_buf, 16]
        ldr     q_x2, [x_buf, 32]
        ldr     q_x3, [x_buf, 48]
        and     x_counter, x_len, -64
        sub     x_tmp, x_counter, #64
        cmp     x_tmp, 63
        add     x_buf_iter, x_buf, 64
        tbl     v_x0_tmp.16b, {v_x0_tmp.16b}, v_shuffle.16b
        tbl     v_x1.16b, {v_x1.16b}, v_shuffle.16b
        tbl     v_x2.16b, {v_x2.16b}, v_shuffle.16b
        tbl     v_x3.16b, {v_x3.16b}, v_shuffle.16b
        eor     v_x0.16b, v_x0.16b, v_x0_tmp.16b
.endm
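// compute the loop end pointer and build the two 32b 4-fold constants in
// one register: v_p4.d[0] = p4_low, v_p4.d[1] = p4_high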
.macro crc32_load_p4
        add     x_buf_end, x_buf_iter, x_tmp
        mov     x_tmp, p4_low_b0
        movk    x_tmp, p4_low_b1, lsl 16
        fmov    d_p4_low, x_tmp
        mov     x_tmp2, p4_high_b0
        movk    x_tmp2, p4_high_b1, lsl 16
        fmov    d_p4_high, x_tmp2
.endm
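// as crc32_load_p4, but the 4-fold constants are full 64b values assembled
// from four 16b immediates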
.macro crc64_load_p4
        add     x_buf_end, x_buf_iter, x_tmp
        mov     x_tmp, p4_low_b0
        movk    x_tmp, p4_low_b1, lsl 16
        movk    x_tmp, p4_low_b2, lsl 32
        movk    x_tmp, p4_low_b3, lsl 48
        fmov    d_p4_low, x_tmp
        mov     x_tmp2, p4_high_b0
        movk    x_tmp2, p4_high_b1, lsl 16
        movk    x_tmp2, p4_high_b2, lsl 32
        movk    x_tmp2, p4_high_b3, lsl 48
        fmov    d_p4_high, x_tmp2
.endm
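// main loop: each iteration folds the four 128b accumulators over the next
// 64B of input:
//   x_i = (x_i.d[1] * p4_high) ^ (x_i.d[0] * p4_low) ^ y_i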
.macro crc_refl_loop
        .align 3
.clmul_loop:
        // interleave ldr and pmull(2) for uarchs which can only issue a
        // quadword load every other cycle (e.g. A55)
        ldr     q_y0, [x_buf_iter]
        pmull2  v_x0_high.1q, v_x0.2d, v_p4.2d
        ldr     q_y1, [x_buf_iter, 16]
        pmull2  v_x1_high.1q, v_x1.2d, v_p4.2d
        ldr     q_y2, [x_buf_iter, 32]
        pmull2  v_x2_high.1q, v_x2.2d, v_p4.2d
        ldr     q_y3, [x_buf_iter, 48]
        pmull2  v_x3_high.1q, v_x3.2d, v_p4.2d
        pmull   v_x0.1q, v_x0.1d, v_p4.1d
        add     x_buf_iter, x_buf_iter, 64
        pmull   v_x1.1q, v_x1.1d, v_p4.1d
        cmp     x_buf_iter, x_buf_end
        pmull   v_x2.1q, v_x2.1d, v_p4.1d
        pmull   v_x3.1q, v_x3.1d, v_p4.1d
        eor     v_x0.16b, v_x0.16b, v_x0_high.16b
        eor     v_x1.16b, v_x1.16b, v_x1_high.16b
        eor     v_x2.16b, v_x2.16b, v_x2_high.16b
        eor     v_x3.16b, v_x3.16b, v_x3_high.16b
        eor     v_x0.16b, v_x0.16b, v_y0.16b
        eor     v_x1.16b, v_x1.16b, v_y1.16b
        eor     v_x2.16b, v_x2.16b, v_y2.16b
        eor     v_x3.16b, v_x3.16b, v_y3.16b
        bne     .clmul_loop
.endm
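// as crc_refl_loop, with each incoming 16B block byte-reversed via tbl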
.macro crc_norm_loop
        .align 3
.clmul_loop:
        // interleave ldr and pmull(2) for uarchs which can only issue a
        // quadword load every other cycle (e.g. A55)
        ldr     q_y0, [x_buf_iter]
        pmull2  v_x0_high.1q, v_x0.2d, v_p4.2d
        ldr     q_y1, [x_buf_iter, 16]
        pmull2  v_x1_high.1q, v_x1.2d, v_p4.2d
        ldr     q_y2, [x_buf_iter, 32]
        pmull2  v_x2_high.1q, v_x2.2d, v_p4.2d
        ldr     q_y3, [x_buf_iter, 48]
        pmull2  v_x3_high.1q, v_x3.2d, v_p4.2d
        pmull   v_x0.1q, v_x0.1d, v_p4.1d
        add     x_buf_iter, x_buf_iter, 64
        pmull   v_x1.1q, v_x1.1d, v_p4.1d
        cmp     x_buf_iter, x_buf_end
        pmull   v_x2.1q, v_x2.1d, v_p4.1d
        pmull   v_x3.1q, v_x3.1d, v_p4.1d
        tbl     v_y0.16b, {v_y0.16b}, v_shuffle.16b
        tbl     v_y1.16b, {v_y1.16b}, v_shuffle.16b
        tbl     v_y2.16b, {v_y2.16b}, v_shuffle.16b
        tbl     v_y3.16b, {v_y3.16b}, v_shuffle.16b
        eor     v_x0.16b, v_x0.16b, v_x0_high.16b
        eor     v_x1.16b, v_x1.16b, v_x1_high.16b
        eor     v_x2.16b, v_x2.16b, v_x2_high.16b
        eor     v_x3.16b, v_x3.16b, v_x3_high.16b
        eor     v_x0.16b, v_x0.16b, v_y0.16b
        eor     v_x1.16b, v_x1.16b, v_y1.16b
        eor     v_x2.16b, v_x2.16b, v_y2.16b
        eor     v_x3.16b, v_x3.16b, v_y3.16b
        bne     .clmul_loop
.endm
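// fold the 512b state down to 128b in x3: build the 32b 1-fold constants
// in v_p1, then fold x0 into x1, x1 into x2 and x2 into x3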
.macro crc32_fold_512b_to_128b
        mov     x_tmp, p1_low_b0
        movk    x_tmp, p1_low_b1, lsl 16
        fmov    d_p1_low, x_tmp
        mov     x_tmp2, p1_high_b0
        movk    x_tmp2, p1_high_b1, lsl 16
        fmov    d_p1_high, x_tmp2
        pmull2  v_tmp_high.1q, v_x0.2d, v_p1.2d
        pmull   v_tmp_low.1q, v_x0.1d, v_p1.1d
        eor     v_x1.16b, v_x1.16b, v_tmp_high.16b
        eor     v_x1.16b, v_x1.16b, v_tmp_low.16b
        pmull2  v_tmp_high.1q, v_x1.2d, v_p1.2d
        pmull   v_tmp_low.1q, v_x1.1d, v_p1.1d
        eor     v_x2.16b, v_x2.16b, v_tmp_high.16b
        eor     v_x2.16b, v_x2.16b, v_tmp_low.16b
        pmull2  v_tmp_high.1q, v_x2.2d, v_p1.2d
        pmull   v_tmp_low.1q, v_x2.1d, v_p1.1d
        eor     v_x3.16b, v_x3.16b, v_tmp_high.16b
        eor     v_x3.16b, v_x3.16b, v_tmp_low.16b
.endm
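// as crc32_fold_512b_to_128b, with full 64b 1-fold constants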
.macro crc64_fold_512b_to_128b
        mov     x_tmp, p1_low_b0
        movk    x_tmp, p1_low_b1, lsl 16
        movk    x_tmp, p1_low_b2, lsl 32
        movk    x_tmp, p1_low_b3, lsl 48
        fmov    d_p1_low, x_tmp
        mov     x_tmp2, p1_high_b0
        movk    x_tmp2, p1_high_b1, lsl 16
        movk    x_tmp2, p1_high_b2, lsl 32
        movk    x_tmp2, p1_high_b3, lsl 48
        fmov    d_p1_high, x_tmp2
        pmull2  v_tmp_high.1q, v_x0.2d, v_p1.2d
        pmull   v_tmp_low.1q, v_x0.1d, v_p1.1d
        eor     v_x1.16b, v_x1.16b, v_tmp_high.16b
        eor     v_x1.16b, v_x1.16b, v_tmp_low.16b
        pmull2  v_tmp_high.1q, v_x1.2d, v_p1.2d
        pmull   v_tmp_low.1q, v_x1.1d, v_p1.1d
        eor     v_x2.16b, v_x2.16b, v_tmp_high.16b
        eor     v_x2.16b, v_x2.16b, v_tmp_low.16b
        pmull2  v_tmp_high.1q, v_x2.2d, v_p1.2d
        pmull   v_tmp_low.1q, v_x2.1d, v_p1.1d
        eor     v_x3.16b, v_x3.16b, v_tmp_high.16b
        eor     v_x3.16b, v_x3.16b, v_tmp_low.16b
.endm