riscv64: Implement crc16_t10dif_copy

Rather than duplicating all of the crc32 4-folding code and modifying it to also write the bytes it reads to the destination, implement a very simple memcpy that then tail-calls crc16_t10dif. This makes crc16_t10dif_copy much slower than crc16_t10dif, but still about twice as fast as crc16_t10dif_copy_base.

Signed-off-by: Daniel Gregory <daniel.gregory@bytedance.com>
2024-12-12 09:23:50 +01:00 · 2024-07-26 16:14:12 +00:00 · 2024-07-26 16:14:12 +00:00 · a62dd046b0
commit a62dd046b0
parent b9e602283f
1 changed files with 24 additions and 0 deletions
--- a/crc/riscv64/crc16_t10dif.S
+++ b/crc/riscv64/crc16_t10dif.S
@@ -71,3 +71,27 @@ crc16_t10dif:
 	.dword 0x000000002d560000
 .k6:
 	.dword 0x0000000013680000
+
+
+/* uint16_t crc16_t10dif_copy(uint16_t seed, uint8_t * dst, uint8_t * src, uint64_t len) */
+/* Copies len bytes from src to dst one byte at a time, then tail-calls crc16_t10dif on src. In: a0 = seed, a1 = dst, a2 = src, a3 = len. Scratch: t0, t1, t2. */
+.text
+.align 1	/* power-of-two operand on RISC-V: 2-byte alignment */
+.global crc16_t10dif_copy
+.type crc16_t10dif_copy, %function
+crc16_t10dif_copy:
+	beqz a3, .memcpy_done	/* len == 0: nothing to copy, skip the loop entirely */
+	add t0, a2, a3	/* t0 = src + len, one past the last source byte */
+	mv t1, a2	/* t1 = read cursor; a2 keeps pointing at src for the crc call below */
+.memcpy_loop:	/* invariant: t1 != t0, i.e. at least one byte is left to copy */
+	lb t2, 0(t1)	/* load one byte; the sign extension is harmless because sb stores only the low 8 bits */
+	sb t2, 0(a1)	/* store it at the write cursor (a1 = dst, advanced in place) */
+	addi t1, t1, 1	/* advance read cursor */
+	addi a1, a1, 1	/* advance write cursor */
+	bne t1, t0, .memcpy_loop	/* loop until the read cursor reaches src + len */
+
+.memcpy_done:
+	/* tail-call crc function with a0 = seed (never modified above), a1 = src, a2 = len */
+	mv a1, a2	/* a1 = src; the dst pointer is no longer needed */
+	mv a2, a3	/* a2 = len */
+	tail crc16_t10dif	/* jump without linking, so crc16_t10dif returns straight to our caller */