mirror of
https://github.com/intel/isa-l.git
synced 2024-12-12 09:23:50 +01:00
4785428d2f
+ Utilise `pmull2` instruction in main loops of arm64 crc functions and avoid the need for `dup` to align multiplicands. + Use just 1 ASIMD register to hold both 64b p4 constants, appropriately aligned. + Interleave quadword `ldr` with `pmull{2}` to avoid unnecessary stalls on existing LITTLE uarch (which can only issue these instructions every other cycle). + Similarly interleave scalar instructions with ASIMD instructions to increase likelihood of instruction-level parallelism on a variety of uarch. + Cut down on needless instructions in non-critical sections to help performance for small buffers. + Extract common instruction sequences into inner macros and move them into a shared header, crc_common_pmull.h. + Use the same human-readable register aliases and register allocation in all 4 implementations; never refer to registers without using a human-readable alias. + Use #defines rather than .req to allow use of the same names across several implementations. + Reduce tail case size from 1024B to 64B. + Phrase the `eor` instructions in the main loop to more clearly show that we can rewrite pairs of `eor` instructions with a single `eor3` instruction in the presence of Armv8.2-SHA (should probably be an option in multibinary in future). Change-Id: I3688193ea4ad88b53cf47e5bd9a7fd5c2b4401e1 Signed-off-by: Samuel Lee <samuel.lee@microsoft.com> |
||
---|---|---|
.. | ||
aarch64 | ||
crc16_t10dif_01.asm | ||
crc16_t10dif_by4.asm | ||
crc16_t10dif_copy_by4.asm | ||
crc16_t10dif_copy_perf.c | ||
crc16_t10dif_copy_test.c | ||
crc16_t10dif_op_perf.c | ||
crc16_t10dif_perf.c | ||
crc16_t10dif_test.c | ||
crc32_funcs_test.c | ||
crc32_gzip_refl_by8.asm | ||
crc32_gzip_refl_perf.c | ||
crc32_ieee_01.asm | ||
crc32_ieee_by4.asm | ||
crc32_ieee_perf.c | ||
crc32_iscsi_00.asm | ||
crc32_iscsi_01.asm | ||
crc32_iscsi_perf.c | ||
crc64_base.c | ||
crc64_ecma_norm_by8.asm | ||
crc64_ecma_norm_by16_10.asm | ||
crc64_ecma_refl_by8.asm | ||
crc64_ecma_refl_by16_10.asm | ||
crc64_example.c | ||
crc64_funcs_perf.c | ||
crc64_funcs_test.c | ||
crc64_iso_norm_by8.asm | ||
crc64_iso_norm_by16_10.asm | ||
crc64_iso_refl_by8.asm | ||
crc64_iso_refl_by16_10.asm | ||
crc64_jones_norm_by8.asm | ||
crc64_jones_norm_by16_10.asm | ||
crc64_jones_refl_by8.asm | ||
crc64_jones_refl_by16_10.asm | ||
crc64_multibinary.asm | ||
crc64_ref.h | ||
crc_base_aliases.c | ||
crc_base.c | ||
crc_multibinary.asm | ||
crc_ref.h | ||
crc_simple_test.c | ||
Makefile.am |