Mirror of https://github.com/intel/isa-l.git, synced 2024-12-13 09:52:56 +01:00
crc16: Accelerate T10DIF performance with prefetch and pmull2
The memory block size handled by T10DIF is generally a 512-byte sector, so prefetching can effectively reduce cache misses. Using ldp instead of ldr reduces the instruction count, and pairing pmull with pmull2 reduces register accesses. Perf tests show a 5x ~ 14x performance improvement after this optimization.

Change-Id: Ibd3f08036b6a45443ffc15f808fd3b467294c283
Signed-off-by: Chunsong Feng <fengchunsong@huawei.com>
parent ad8dce15c6
commit e297ecae7a
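Note on the pmull/pmull2 change: the old loop split each 128-bit lane into halves with dup and issued eight pmull instructions per iteration; the new loop issues four pmull+pmull2 pairs that consume whole q-registers directly. Below is a minimal C sketch of one such 128-bit fold step, assuming ACLE crypto intrinsics (vmull_p64 maps to pmull, vmull_high_p64 to pmull2; build with -march=armv8-a+crypto). It packs the same constants the patch loads into q_fold_const; the byte-order shuffle done by tbl/v_permutation is omitted, and fold_128 is an illustrative name, not part of the patch.

/* A minimal sketch of one 128-bit fold step, assuming ACLE crypto
 * intrinsics. The tbl byte-order shuffle via v_permutation is omitted;
 * fold_128 is an illustrative name only. */
#include <arm_neon.h>

static inline uint8x16_t fold_128(uint8x16_t x, uint8x16_t y)
{
        /* same 128-bit constant the patch loads into q_fold_const:
         * low lane 0x87e70000 (p4[0]), high lane 0x371d0000 (p4[1]) */
        const poly64x2_t k = vcombine_p64(vcreate_p64(0x87e70000ULL),
                                          vcreate_p64(0x371d0000ULL));
        poly64x2_t vx = vreinterpretq_p64_u8(x);

        /* pmull: low 64 bits of x times the low lane of k */
        uint8x16_t lo = vreinterpretq_u8_p128(
                vmull_p64(vgetq_lane_p64(vx, 0), vgetq_lane_p64(k, 0)));
        /* pmull2: high 64 bits of x times the high lane of k */
        uint8x16_t hi = vreinterpretq_u8_p128(vmull_high_p64(vx, k));

        /* fold both products into the next 16-byte data block y */
        return veorq_u8(veorq_u8(hi, lo), y);
}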
crc16_t10dif_copy_pmull.S
@@ -68,7 +68,7 @@ x_src_saved .req x0
 x_dst_saved .req x12
 
 crc16_t10dif_copy_pmull:
-        cmp x_len, 1023
+        cmp x_len, 63
         sub sp, sp, #16
         uxth w_seed, w_seed
         bhi .crc_fold
@@ -118,6 +118,9 @@ d_x1 .req d3
 d_x2 .req d4
 d_x3 .req d5
 
+q_permutation .req q7
+v_permutation .req v7
+
 // the following registers only used this part1
 d_tmp3 .req d16
 v_tmp3 .req v16
@@ -136,18 +139,14 @@ v_tmp3 .req v16
         add x_src_saved, x_src, 64
         add x_dst_saved, x_dst, 64
 
-        ldr q_x0, [x_src]
-        ldr q_x1, [x_src, 16]
-        ldr q_x2, [x_src, 32]
-        ldr q_x3, [x_src, 48]
+        ldp q_x0, q_x1, [x_src]
+        ldp q_x2, q_x3, [x_src, 32]
 
-        str q_x0, [x_dst]
-        str q_x1, [x_dst, 16]
-        str q_x2, [x_dst, 32]
-        str q_x3, [x_dst, 48]
+        stp q_x0, q_x1, [x_dst]
+        stp q_x2, q_x3, [x_dst, 32]
 
         adrp x_tmp, .shuffle_mask_lanchor
-        ldr q7, [x_tmp, :lo12:.shuffle_mask_lanchor]
+        ldr q_permutation, [x_tmp, :lo12:.shuffle_mask_lanchor]
 
         tbl v_tmp1.16b, {v_x0.16b}, v7.16b
         eor v_x0.16b, v_tmp3.16b, v_tmp1.16b
@@ -191,15 +190,10 @@ v_tmp1_x1 .req v25
 v_tmp1_x2 .req v26
 v_tmp1_x3 .req v27
 
-d_p4_h .req d19
-v_p4_h .req v19
-d_p4_l .req d17
-v_p4_l .req v17
+q_fold_const .req q17
+v_fold_const .req v17
 
-        mov x_tmp, 0x371d0000 /* p4 [1] */
-        fmov d_p4_h, x_tmp
-        mov x_tmp, 0x87e70000 /* p4 [0] */
-        fmov d_p4_l, x_tmp
+        ldr q_fold_const, =0x371d00000000000087e70000;
 
         .align 2
 .crc_fold_loop:
@@ -209,49 +203,42 @@ v_p4_l .req v17
         sub x_counter, x_counter, #64
         cmp x_counter, 63
 
-        dup d_x0_h, v_x0.d[1]
-        dup d_x1_h, v_x1.d[1]
-        dup d_x2_h, v_x2.d[1]
-        dup d_x3_h, v_x3.d[1]
+        ldp q_y0, q_y1, [x_src_saved, -64]
+        ldp q_y2, q_y3, [x_src_saved, -32]
 
-        dup d_x0_l, v_x0.d[0]
-        dup d_x1_l, v_x1.d[0]
-        dup d_x2_l, v_x2.d[0]
-        dup d_x3_l, v_x3.d[0]
+        stp q_y0, q_y1, [x_dst_saved, -64]
+        stp q_y2, q_y3, [x_dst_saved, -32]
 
-        ldr q_y0, [x_src_saved, -64]
-        ldr q_y1, [x_src_saved, -48]
-        ldr q_y2, [x_src_saved, -32]
-        ldr q_y3, [x_src_saved, -16]
+        prfm pldl2strm, [x_src_saved, #1024]
+        prfm pldl2strm, [x_src_saved, #1088]
 
-        str q_y0, [x_dst_saved, -64]
-        str q_y1, [x_dst_saved, -48]
-        str q_y2, [x_dst_saved, -32]
-        str q_y3, [x_dst_saved, -16]
+        pmull2 v_tmp1_x0.1q, v_x0.2d, v_fold_const.2d
+        pmull v_x0.1q, v_x0.1d, v_fold_const.1d
 
-        pmull v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
-        pmull v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
-        pmull v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
-        pmull v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
-        pmull v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
-        pmull v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
-        pmull v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
-        pmull v_x3_l.1q, v_x3_l.1d, v_p4_l.1d
+        pmull2 v_tmp1_x1.1q, v_x1.2d, v_fold_const.2d
+        pmull v_x1.1q, v_x1.1d, v_fold_const.1d
 
-        tbl v_y0.16b, {v_y0.16b}, v7.16b
-        tbl v_y1.16b, {v_y1.16b}, v7.16b
-        tbl v_y2.16b, {v_y2.16b}, v7.16b
-        tbl v_y3.16b, {v_y3.16b}, v7.16b
+        pmull2 v_tmp1_x2.1q, v_x2.2d, v_fold_const.2d
+        pmull v_x2.1q, v_x2.1d, v_fold_const.1d
 
-        eor v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
-        eor v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
-        eor v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
-        eor v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b
+        pmull2 v_tmp1_x3.1q, v_x3.2d, v_fold_const.2d
+        pmull v_x3.1q, v_x3.1d, v_fold_const.1d
 
-        eor v_x0.16b, v_tmp1_x0.16b, v_y0.16b
-        eor v_x1.16b, v_tmp1_x1.16b, v_y1.16b
-        eor v_x2.16b, v_tmp1_x2.16b, v_y2.16b
-        eor v_x3.16b, v_tmp1_x3.16b, v_y3.16b
+        tbl v_y0.16b, {v_y0.16b}, v_permutation.16b
+        eor v_x0.16b, v_tmp1_x0.16b, v_x0.16b
+        eor v_x0.16b, v_x0.16b, v_y0.16b
+
+        tbl v_y1.16b, {v_y1.16b}, v_permutation.16b
+        eor v_x1.16b, v_tmp1_x1.16b, v_x1.16b
+        eor v_x1.16b, v_x1.16b, v_y1.16b
+
+        tbl v_y2.16b, {v_y2.16b}, v_permutation.16b
+        eor v_x2.16b, v_tmp1_x2.16b, v_x2.16b
+        eor v_x2.16b, v_x2.16b, v_y2.16b
+
+        tbl v_y3.16b, {v_y3.16b}, v_permutation.16b
+        eor v_x3.16b, v_tmp1_x3.16b, v_x3.16b
+        eor v_x3.16b, v_x3.16b, v_y3.16b
 
         bhi .crc_fold_loop
 
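The two prfm pldl2strm hints above request data 1024 and 1088 bytes past x_src_saved, i.e. about 1 KiB (two 512-byte sectors) ahead of the block currently being folded, with a streaming hint aimed at L2. A rough C analogue, assuming GCC/Clang's __builtin_prefetch (locality hint 1 only approximates the L2 streaming behavior, and fold_64bytes is a hypothetical per-block helper, not part of this patch):

#include <stddef.h>
#include <stdint.h>

void fold_64bytes(const uint8_t *block);  /* hypothetical helper */

void fold_all(const uint8_t *src, size_t len)
{
        for (size_t i = 0; i + 64 <= len; i += 64) {
                /* read prefetch (rw=0), low temporal locality (~L2 stream) */
                __builtin_prefetch(src + i + 1024, 0, 1);
                __builtin_prefetch(src + i + 1088, 0, 1);
                fold_64bytes(src + i);
        }
}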
crc16_t10dif_pmull.S
@@ -66,7 +66,7 @@ x_crc16tab .req x4
 x_buf_saved .req x0
 
 crc16_t10dif_pmull:
-        cmp x_len, 1023
+        cmp x_len, 63
         sub sp, sp, #16
         uxth w_seed, w_seed
         bhi .crc_fold
@@ -114,6 +114,9 @@ d_x1 .req d3
 d_x2 .req d4
 d_x3 .req d5
 
+q_permutation .req q7
+v_permutation .req v7
+
 // the following registers only used this part1
 d_tmp3 .req d16
 v_tmp3 .req v16
@@ -131,10 +134,8 @@ v_tmp3 .req v16
         cmp x_counter, 63
         add x_buf_saved, x_buf, 64
 
-        ldr q_x0, [x_buf]
-        ldr q_x1, [x_buf, 16]
-        ldr q_x2, [x_buf, 32]
-        ldr q_x3, [x_buf, 48]
+        ldp q_x0, q_x1, [x_buf]
+        ldp q_x2, q_x3, [x_buf, 32]
 
         adrp x_tmp, .shuffle_mask_lanchor
         ldr q7, [x_tmp, :lo12:.shuffle_mask_lanchor]
@@ -181,15 +182,10 @@ v_tmp1_x1 .req v25
 v_tmp1_x2 .req v26
 v_tmp1_x3 .req v27
 
-d_p4_h .req d19
-v_p4_h .req v19
-d_p4_l .req d17
-v_p4_l .req v17
+q_fold_const .req q17
+v_fold_const .req v17
 
-        mov x_tmp, 0x371d0000 /* p4 [1] */
-        fmov d_p4_h, x_tmp
-        mov x_tmp, 0x87e70000 /* p4 [0] */
-        fmov d_p4_l, x_tmp
+        ldr q_fold_const, =0x371d00000000000087e70000;
 
         .align 2
 .crc_fold_loop:
@@ -197,44 +193,39 @@ v_p4_l .req v17
         sub x_counter, x_counter, #64
         cmp x_counter, 63
 
-        dup d_x0_h, v_x0.d[1]
-        dup d_x1_h, v_x1.d[1]
-        dup d_x2_h, v_x2.d[1]
-        dup d_x3_h, v_x3.d[1]
+        ldp q_y0, q_y1, [x_buf_saved, -64]
+        ldp q_y2, q_y3, [x_buf_saved, -32]
 
-        dup d_x0_l, v_x0.d[0]
-        dup d_x1_l, v_x1.d[0]
-        dup d_x2_l, v_x2.d[0]
-        dup d_x3_l, v_x3.d[0]
+        prfm pldl2strm, [x_buf_saved, #1024]
+        prfm pldl2strm, [x_buf_saved, #1088]
 
-        ldr q_y0, [x_buf_saved, -64]
-        ldr q_y1, [x_buf_saved, -48]
-        ldr q_y2, [x_buf_saved, -32]
-        ldr q_y3, [x_buf_saved, -16]
+        pmull2 v_tmp1_x0.1q, v_x0.2d, v_fold_const.2d
+        pmull v_x0.1q, v_x0.1d, v_fold_const.1d
 
-        pmull v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
-        pmull v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
-        pmull v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
-        pmull v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
-        pmull v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
-        pmull v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
-        pmull v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
-        pmull v_x3_l.1q, v_x3_l.1d, v_p4_l.1d
+        pmull2 v_tmp1_x1.1q, v_x1.2d, v_fold_const.2d
+        pmull v_x1.1q, v_x1.1d, v_fold_const.1d
 
-        tbl v_y0.16b, {v_y0.16b}, v7.16b
-        tbl v_y1.16b, {v_y1.16b}, v7.16b
-        tbl v_y2.16b, {v_y2.16b}, v7.16b
-        tbl v_y3.16b, {v_y3.16b}, v7.16b
+        pmull2 v_tmp1_x2.1q, v_x2.2d, v_fold_const.2d
+        pmull v_x2.1q, v_x2.1d, v_fold_const.1d
 
-        eor v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
-        eor v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
-        eor v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
-        eor v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b
+        pmull2 v_tmp1_x3.1q, v_x3.2d, v_fold_const.2d
+        pmull v_x3.1q, v_x3.1d, v_fold_const.1d
 
-        eor v_x0.16b, v_tmp1_x0.16b, v_y0.16b
-        eor v_x1.16b, v_tmp1_x1.16b, v_y1.16b
-        eor v_x2.16b, v_tmp1_x2.16b, v_y2.16b
-        eor v_x3.16b, v_tmp1_x3.16b, v_y3.16b
+        tbl v_y0.16b, {v_y0.16b}, v_permutation.16b
+        eor v_x0.16b, v_tmp1_x0.16b, v_x0.16b
+        eor v_x0.16b, v_x0.16b, v_y0.16b
+
+        tbl v_y1.16b, {v_y1.16b}, v_permutation.16b
+        eor v_x1.16b, v_tmp1_x1.16b, v_x1.16b
+        eor v_x1.16b, v_x1.16b, v_y1.16b
+
+        tbl v_y2.16b, {v_y2.16b}, v_permutation.16b
+        eor v_x2.16b, v_tmp1_x2.16b, v_x2.16b
+        eor v_x2.16b, v_x2.16b, v_y2.16b
+
+        tbl v_y3.16b, {v_y3.16b}, v_permutation.16b
+        eor v_x3.16b, v_tmp1_x3.16b, v_x3.16b
+        eor v_x3.16b, v_x3.16b, v_y3.16b
 
         bhi .crc_fold_loop
 
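The ldr/str to ldp/stp switch in both files halves the number of load/store instructions needed to move the same 64 bytes per iteration. A loose C analogue, assuming the ACLE paired-load intrinsics vld1q_u8_x2/vst1q_u8_x2 (whether the compiler emits ldp, ld1, or two ldr instructions is its choice; copy64 is an illustrative name, not part of this patch):

#include <arm_neon.h>
#include <stdint.h>

void copy64(uint8_t *dst, const uint8_t *src)
{
        /* two paired 32-byte loads, then two paired 32-byte stores */
        uint8x16x2_t a = vld1q_u8_x2(src);
        uint8x16x2_t b = vld1q_u8_x2(src + 32);
        vst1q_u8_x2(dst, a);
        vst1q_u8_x2(dst + 32, b);
}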