riscv64: Implement crc16_t10dif_copy

Rather than duplicating all of the crc32 4-folding code and modifying it
to also write the bytes it reads to the destination, write a very simple
memcpy that then tail-calls crc16_t10dif. This makes crc16_t10dif_copy
much slower than crc16_t10dif, but still about twice as fast as
crc16_t10dif_copy_base.

Signed-off-by: Daniel Gregory <daniel.gregory@bytedance.com>
This commit is contained in:
Daniel Gregory 2024-07-26 16:14:12 +00:00
parent b9e602283f
commit a62dd046b0

View File

@ -71,3 +71,27 @@ crc16_t10dif:
.dword 0x000000002d560000
.k6:
.dword 0x0000000013680000
/*
 * uint16_t crc16_t10dif_copy(uint16_t seed, uint8_t * dst, uint8_t * src, uint64_t len)
 *
 * Copies len bytes from src to dst, one byte per loop iteration, and then
 * tail-calls crc16_t10dif on the source buffer, so the value handed back to
 * the caller is whatever crc16_t10dif returns.
 *
 * On entry:  a0 = seed, a1 = dst, a2 = src, a3 = len
 * Tail call: a0 = seed (never modified here), a1 = src, a2 = len
 * Scratch:   t0 = write cursor, t1 = read cursor,
 *            t2 = one past the last source byte, t3 = byte in flight
 */
	.text
	.align 1
	.global crc16_t10dif_copy
	.type crc16_t10dif_copy, %function
crc16_t10dif_copy:
	/* len == 0: skip the copy, go straight to the crc call */
	beqz	a3, .t10dif_copy_finish

	/* a2/a3 are left untouched; they are still needed for the tail call */
	mv	t0, a1
	mv	t1, a2
	add	t2, a2, a3

.t10dif_copy_next_byte:
	/* each byte is loaded before it is stored */
	lb	t3, 0(t1)
	addi	t1, t1, 1
	sb	t3, 0(t0)
	addi	t0, t0, 1
	bne	t1, t2, .t10dif_copy_next_byte

.t10dif_copy_finish:
	/* shuffle (seed, dst, src, len) into (seed, src, len) for crc16_t10dif */
	mv	a1, a2
	mv	a2, a3
	tail	crc16_t10dif