Mirror of https://github.com/intel/isa-l.git (synced 2024-12-12 09:23:50 +01:00)

erasure_code: implement EC with AVX512 + GFNI

Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>

parent 9d487fd6db
commit 1eff12dddb
Makefile.nmake:

@@ -91,6 +91,12 @@ objs = \
	bin\gf_4vect_dot_prod_avx512.obj \
	bin\gf_5vect_dot_prod_avx512.obj \
	bin\gf_6vect_dot_prod_avx512.obj \
	bin\gf_vect_dot_prod_avx512_gfni.obj \
	bin\gf_2vect_dot_prod_avx512_gfni.obj \
	bin\gf_3vect_dot_prod_avx512_gfni.obj \
	bin\gf_4vect_dot_prod_avx512_gfni.obj \
	bin\gf_5vect_dot_prod_avx512_gfni.obj \
	bin\gf_6vect_dot_prod_avx512_gfni.obj \
	bin\gf_vect_mad_avx512.obj \
	bin\gf_2vect_mad_avx512.obj \
	bin\gf_3vect_mad_avx512.obj \
erasure_code/Makefile.am:

@@ -84,6 +84,12 @@ lsrc_x86_64 += \
	erasure_code/gf_4vect_dot_prod_avx512.asm \
	erasure_code/gf_5vect_dot_prod_avx512.asm \
	erasure_code/gf_6vect_dot_prod_avx512.asm \
	erasure_code/gf_vect_dot_prod_avx512_gfni.asm \
	erasure_code/gf_2vect_dot_prod_avx512_gfni.asm \
	erasure_code/gf_3vect_dot_prod_avx512_gfni.asm \
	erasure_code/gf_4vect_dot_prod_avx512_gfni.asm \
	erasure_code/gf_5vect_dot_prod_avx512_gfni.asm \
	erasure_code/gf_6vect_dot_prod_avx512_gfni.asm \
	erasure_code/gf_vect_mad_avx512.asm \
	erasure_code/gf_2vect_mad_avx512.asm \
	erasure_code/gf_3vect_mad_avx512.asm \
erasure_code/ec_base.c:

@@ -29,10 +29,12 @@

#include <limits.h>
#include <string.h>	// for memset
#include <stdint.h>

#include "erasure_code.h"
#include "ec_base.h"	// for GF tables

-void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
+void ec_init_tables_base(int k, int rows, unsigned char *a, unsigned char *g_tbls)
{
	int i, j;

@ -30,6 +30,77 @@
|
||||
#ifndef _EC_BASE_H_
|
||||
#define _EC_BASE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define MAX_NUM_OUTPUTS_CALL 6
|
||||
|
||||
static const uint64_t gf_table_gfni[256] = {
|
||||
0x0000000000000000, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0,
|
||||
0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0,
|
||||
0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0,
|
||||
0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0,
|
||||
0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448,
|
||||
0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468,
|
||||
0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58,
|
||||
0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78,
|
||||
0x8810a8d83871e2c4, 0x8912acd02851a244, 0x08112a5cb061c284, 0x09132e54a0418204,
|
||||
0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224,
|
||||
0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14,
|
||||
0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34,
|
||||
0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c,
|
||||
0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac,
|
||||
0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c,
|
||||
0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc,
|
||||
0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122,
|
||||
0x840895aed8b061c2, 0x850a91a6c8902142, 0x0409172a50a04182, 0x050b132240800102,
|
||||
0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932,
|
||||
0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912,
|
||||
0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa,
|
||||
0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a,
|
||||
0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba,
|
||||
0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a,
|
||||
0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6,
|
||||
0x0c183d76e0c18306, 0x0d1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6,
|
||||
0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6,
|
||||
0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6,
|
||||
0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e,
|
||||
0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e,
|
||||
0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e,
|
||||
0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e,
|
||||
0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1,
|
||||
0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891,
|
||||
0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1,
|
||||
0x82048b95a850a041, 0x83068f9db870e0c1, 0x0205091120408001, 0x03070d193060c081,
|
||||
0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39,
|
||||
0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19,
|
||||
0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429,
|
||||
0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409,
|
||||
0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75,
|
||||
0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55,
|
||||
0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265,
|
||||
0x0a14234d90214285, 0x0b16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245,
|
||||
0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd,
|
||||
0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd,
|
||||
0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed,
|
||||
0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd,
|
||||
0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953,
|
||||
0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973,
|
||||
0x060c1e3b70e0c183, 0x070e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143,
|
||||
0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163,
|
||||
0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb,
|
||||
0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb,
|
||||
0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb,
|
||||
0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb,
|
||||
0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97,
|
||||
0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7,
|
||||
0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0x0e1d3467c0810307, 0x0f1f306fd0a14387,
|
||||
0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7,
|
||||
0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f,
|
||||
0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f,
|
||||
0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f,
|
||||
0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f
|
||||
};
|
||||
|
||||
// Global GF(256) tables
|
||||
#ifndef GF_LARGE_TABLES
|
||||
static const unsigned char gff_base[] = {
|
||||
|
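Editor's note: each entry of gf_table_gfni[] above packs the 8x8 GF(2) bit matrix that multiplies a byte by the constant equal to its index, in the layout consumed by vgf2p8affineqb (byte k of the qword is the input-bit mask feeding output bit 7-k). A sketch of how such a table can be generated, assuming ISA-L's usual GF(2^8) reduction polynomial 0x11d; the entries for index 1 (0x0102040810204080, the identity matrix) and index 2 match this construction:

	#include <stdint.h>

	/* GF(2^8) multiply, polynomial 0x11d (assumption: ISA-L's field). */
	static unsigned char gf_mul_byte(unsigned char a, unsigned char b)
	{
		unsigned char p = 0;

		while (b) {
			if (b & 1)
				p ^= a;
			a = (unsigned char) ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
			b >>= 1;
		}
		return p;
	}

	/* Affine matrix for "multiply by c" in vgf2p8affineqb layout:
	 * output bit i = parity(qword.byte[7 - i] & input_byte). */
	static uint64_t gfni_matrix(unsigned char c)
	{
		uint64_t m = 0;

		for (int i = 0; i < 8; i++) {		/* output bit index */
			unsigned char mask = 0;

			for (int col = 0; col < 8; col++)
				if (gf_mul_byte(c, (unsigned char) (1 << col)) & (1 << i))
					mask |= (unsigned char) (1 << col);
			m |= (uint64_t) mask << (8 * (7 - i));
		}
		return m;	/* gfni_matrix(i) == gf_table_gfni[i] */
	}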
erasure_code/ec_highlevel_func.c:

@@ -28,6 +28,7 @@
**********************************************************************/
#include <limits.h>
#include "erasure_code.h"
#include "ec_base.h"	/* for GF tables */

#if __x86_64__ || __i386__ || _M_X64 || _M_IX86
void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
@@ -238,6 +239,66 @@ void ec_encode_data_update_avx512(int len, int k, int rows, int vec_i, unsigned
	}
}

#if AS_FEATURE_LEVEL >= 10

extern void gf_vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
					 unsigned char **data, unsigned char *dest);
extern void gf_2vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
					  unsigned char **data, unsigned char **coding);
extern void gf_3vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
					  unsigned char **data, unsigned char **coding);
extern void gf_4vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
					  unsigned char **data, unsigned char **coding);
extern void gf_5vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
					  unsigned char **data, unsigned char **coding);
extern void gf_6vect_dot_prod_avx512_gfni(int len, int k, unsigned char *g_tbls,
					  unsigned char **data, unsigned char **coding);

void ec_init_tables_gfni(int k, int rows, unsigned char *a, unsigned char *g_tbls)
{
	int i, j;

	uint64_t *g64 = (uint64_t *) g_tbls;

	for (i = 0; i < rows; i++)
		for (j = 0; j < k; j++)
			*(g64++) = gf_table_gfni[*a++];

}

void ec_encode_data_avx512_gfni(int len, int k, int rows, unsigned char *g_tbls,
				unsigned char **data, unsigned char **coding)
{

	while (rows >= 6) {
		gf_6vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
		g_tbls += 6 * k * 8;
		coding += 6;
		rows -= 6;
	}
	switch (rows) {
	case 5:
		gf_5vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
		break;
	case 4:
		gf_4vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
		break;
	case 3:
		gf_3vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
		break;
	case 2:
		gf_2vect_dot_prod_avx512_gfni(len, k, g_tbls, data, coding);
		break;
	case 1:
		gf_vect_dot_prod_avx512_gfni(len, k, g_tbls, data, *coding);
		break;
	case 0:
	default:
		break;
	}
}

#endif // AS_FEATURE_LEVEL >= 10
#endif // HAVE_AS_KNOWS_AVX512

#if __WORDSIZE == 64 || _WIN64 || __x86_64__
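Editor's note: ec_init_tables_gfni() expands the rows x k coefficient matrix into one 8-byte affine matrix per coefficient, so the GFNI g_tbls buffer needs rows * k * 8 bytes; that is why the 6-output loop advances g_tbls by 6 * k * 8. A minimal usage sketch, assuming these internal entry points (applications would normally go through the ec_encode_data() dispatcher):

	#include <stdlib.h>

	extern void ec_init_tables_gfni(int k, int rows, unsigned char *a, unsigned char *g_tbls);
	extern void ec_encode_data_avx512_gfni(int len, int k, int rows, unsigned char *g_tbls,
					       unsigned char **data, unsigned char **coding);

	/* Hypothetical driver: k data buffers and p parity buffers of len bytes
	 * each; encode_matrix is a (k + p) x k matrix, e.g. from
	 * gf_gen_cauchy1_matrix(), whose first k rows are the identity. */
	static int encode_gfni_sketch(int len, int k, int p, unsigned char *encode_matrix,
				      unsigned char **data, unsigned char **coding)
	{
		unsigned char *g_tbls = malloc((size_t) p * k * 8);	/* 8 B per coefficient */

		if (g_tbls == NULL)
			return -1;
		ec_init_tables_gfni(k, p, &encode_matrix[k * k], g_tbls);	/* parity rows only */
		ec_encode_data_avx512_gfni(len, k, p, g_tbls, data, coding);
		free(g_tbls);
		return 0;
	}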
erasure_code/ec_multibinary.asm:

@@ -53,6 +53,13 @@
extern gf_vect_mad_avx2
%endif

%if (AS_FEATURE_LEVEL) >= 10
extern ec_init_tables_gfni
extern ec_encode_data_avx512_gfni
%endif

extern ec_init_tables_base

extern gf_vect_mul_base
extern ec_encode_data_base
extern ec_encode_data_update_base
@@ -71,6 +78,7 @@ mbin_interface gf_vect_dot_prod
mbin_interface gf_vect_mul
mbin_interface ec_encode_data_update
mbin_interface gf_vect_mad
mbin_interface ec_init_tables

%ifidn __OUTPUT_FORMAT__, elf32
mbin_dispatch_init5 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2
@@ -78,11 +86,13 @@ mbin_interface gf_vect_mad
mbin_dispatch_init2 gf_vect_mul, gf_vect_mul_base
mbin_dispatch_init2 ec_encode_data_update, ec_encode_data_update_base
mbin_dispatch_init2 gf_vect_mad, gf_vect_mad_base
mbin_dispatch_init2 ec_init_tables, ec_init_tables_base
%else

mbin_dispatch_init5 gf_vect_mul, gf_vect_mul_base, gf_vect_mul_sse, gf_vect_mul_avx, gf_vect_mul_avx
-mbin_dispatch_init6 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2, ec_encode_data_avx512
+mbin_dispatch_init7 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2, ec_encode_data_avx512, ec_encode_data_avx512_gfni
mbin_dispatch_init6 ec_encode_data_update, ec_encode_data_update_base, ec_encode_data_update_sse, ec_encode_data_update_avx, ec_encode_data_update_avx2, ec_encode_data_update_avx512
mbin_dispatch_init6 gf_vect_mad, gf_vect_mad_base, gf_vect_mad_sse, gf_vect_mad_avx, gf_vect_mad_avx2, gf_vect_mad_avx512
mbin_dispatch_init6 gf_vect_dot_prod, gf_vect_dot_prod_base, gf_vect_dot_prod_sse, gf_vect_dot_prod_avx, gf_vect_dot_prod_avx2, gf_vect_dot_prod_avx512
mbin_dispatch_init7 ec_init_tables, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_base, ec_init_tables_gfni
%endif
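Editor's note: the mbin_interface/mbin_dispatch_initN macros implement a self-patching function pointer: the first call through the exported symbol probes the CPU and rebinds the pointer to the best implementation, so later calls go straight to it. A simplified C sketch of the idea (the probe names are illustrative only, standing in for the CPUID tests the macros actually perform):

	/* Hypothetical feature probes. */
	extern int cpu_has_avx512(void);
	extern int cpu_has_gfni(void);

	extern void ec_encode_data_base(int, int, int, unsigned char *,
					unsigned char **, unsigned char **);
	extern void ec_encode_data_avx512_gfni(int, int, int, unsigned char *,
					       unsigned char **, unsigned char **);

	static void dispatch(int len, int k, int rows, unsigned char *g_tbls,
			     unsigned char **data, unsigned char **coding);

	/* The exported symbol starts out pointing at the dispatcher. */
	static void (*ec_encode_data_fn)(int, int, int, unsigned char *,
					 unsigned char **, unsigned char **) = dispatch;

	static void dispatch(int len, int k, int rows, unsigned char *g_tbls,
			     unsigned char **data, unsigned char **coding)
	{
		/* First call: pick the best implementation once, then rebind. */
		if (cpu_has_avx512() && cpu_has_gfni())
			ec_encode_data_fn = ec_encode_data_avx512_gfni;
		else
			ec_encode_data_fn = ec_encode_data_base;
		ec_encode_data_fn(len, k, rows, g_tbls, data, coding);
	}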
erasure_code/gf_2vect_dot_prod_avx512_gfni.asm (new file, 209 lines):

@@ -0,0 +1,209 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_2vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0   rdi
%define arg1   rsi
%define arg2   rdx
%define arg3   rcx
%define arg4   r8
%define arg5   r9

%define tmp    r11
%define tmp2   r10
%define tmp3   r12	; must be saved and restored

%define func(x) x: endbranch
%macro FUNC_SAVE 0
	push	r12
%endmacro
%macro FUNC_RESTORE 0
	pop	r12
%endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0   rcx
%define arg1   rdx
%define arg2   r8
%define arg3   r9

%define arg4   r12	; must be saved, loaded and restored
%define arg5   r14	; must be saved and restored
%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define stack_size 3*8	; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + 8 + 8*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	mov	[rsp + 0*8], r12
	mov	[rsp + 1*8], r13
	mov	[rsp + 2*8], r14
	end_prolog
	mov	arg4, arg(4)
%endmacro

%macro FUNC_RESTORE 0
	mov	r12, [rsp + 0*8]
	mov	r13, [rsp + 1*8]
	mov	r14, [rsp + 2*8]
	add	rsp, stack_size
%endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define pos       rax


%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu8
%define XSTR vmovdqu8
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa64
%define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif

%define xgft1 zmm3
%define xgft2 zmm4

%define x0    zmm0
%define xp1   zmm1
%define xp2   zmm2

default rel
[bits 64]

section .text

;;
;; Encodes 64 bytes of all "k" sources into 2x 64 bytes (parity disks)
;;
%macro ENCODE_64B_2 0-1
%define %%KMASK %1

	vpxorq	xp1, xp1, xp1
	vpxorq	xp2, xp2, xp2
	mov	tmp, mul_array
	xor	vec_i, vec_i

%%next_vect:
	mov	ptr, [src + vec_i]
%if %0 == 1
	vmovdqu8 x0{%%KMASK}, [ptr + pos]	;Get next source vector (less than 64 bytes)
%else
	XLDR	x0, [ptr + pos]			;Get next source vector (64 bytes)
%endif
	add	vec_i, 8

	vbroadcastf32x2 xgft1, [tmp]
	vbroadcastf32x2 xgft2, [tmp + vec]
	add	tmp, 8

	GF_MUL_XOR x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2

	cmp	vec_i, vec
	jl	%%next_vect

%if %0 == 1
	vmovdqu8 [dest1 + pos]{%%KMASK}, xp1
	vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
%else
	XSTR	[dest1 + pos], xp1
	XSTR	[dest2 + pos], xp2
%endif
%endmacro

align 16
mk_global gf_2vect_dot_prod_avx512_gfni, function
func(gf_2vect_dot_prod_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	shl	vec, 3		;vec *= 8. Make vec_i count by 8
	mov	dest2, [dest1 + 8]
	mov	dest1, [dest1]

	cmp	len, 64
	jb	.len_lt_64

.loop64:

	ENCODE_64B_2

	add	pos, 64		;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit

	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_2 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif	; if AS_FEATURE_LEVEL >= 10
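Editor's note: the tail path above builds a byte mask of (1 << len) - 1 set bits -- bts sets bit number len, dec turns it into len trailing ones -- so a single masked vmovdqu8 load/store handles the final partial 64-byte block. The same logic as a C sketch:

	#include <stdint.h>

	/* Equivalent of: xor tmp, tmp / bts tmp, len / dec tmp / kmovq k1, tmp */
	static uint64_t tail_mask(int len)	/* 0 < len < 64 */
	{
		uint64_t m = 0;

		m |= UINT64_C(1) << len;	/* bts tmp, len */
		m -= 1;				/* dec tmp: low len bits set */
		return m;			/* loaded into mask register k1 */
	}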
erasure_code/gf_3vect_dot_prod_avx512_gfni.asm (new file, 225 lines):

@@ -0,0 +1,225 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_3vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0   rdi
%define arg1   rsi
%define arg2   rdx
%define arg3   rcx
%define arg4   r8
%define arg5   r9

%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r12	; must be saved and restored

%define func(x) x: endbranch
%macro FUNC_SAVE 0
	push	r12
	push	r13
%endmacro
%macro FUNC_RESTORE 0
	pop	r13
	pop	r12
%endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0   rcx
%define arg1   rdx
%define arg2   r8
%define arg3   r9

%define arg4   r12	; must be saved, loaded and restored
%define arg5   r15	; must be saved and restored
%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r14	; must be saved and restored
%define stack_size 1*16 + 5*8	; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + 8 + 8*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	mov	[rsp + 1*16 + 0*8], r12
	mov	[rsp + 1*16 + 1*8], r13
	mov	[rsp + 1*16 + 2*8], r14
	mov	[rsp + 1*16 + 3*8], r15
	end_prolog
	mov	arg4, arg(4)
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	mov	r12, [rsp + 1*16 + 0*8]
	mov	r13, [rsp + 1*16 + 1*8]
	mov	r14, [rsp + 1*16 + 2*8]
	mov	r15, [rsp + 1*16 + 3*8]
	add	rsp, stack_size
%endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define pos       rax


%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu8
%define XSTR vmovdqu8
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa64
%define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif

%define xgft1 zmm4
%define xgft2 zmm5
%define xgft3 zmm6

%define x0    zmm0
%define xp1   zmm1
%define xp2   zmm2
%define xp3   zmm3

default rel
[bits 64]

section .text

;;
;; Encodes 64 bytes of all "k" sources into 3x 64 bytes (parity disks)
;;
%macro ENCODE_64B_3 0-1
%define %%KMASK %1

	vpxorq	xp1, xp1, xp1
	vpxorq	xp2, xp2, xp2
	vpxorq	xp3, xp3, xp3
	mov	tmp, mul_array
	xor	vec_i, vec_i

%%next_vect:
	mov	ptr, [src + vec_i]
%if %0 == 1
	vmovdqu8 x0{%%KMASK}, [ptr + pos]	;Get next source vector (less than 64 bytes)
%else
	XLDR	x0, [ptr + pos]			;Get next source vector (64 bytes)
%endif
	add	vec_i, 8

	vbroadcastf32x2 xgft1, [tmp]
	vbroadcastf32x2 xgft2, [tmp + vec]
	vbroadcastf32x2 xgft3, [tmp + vec*2]
	add	tmp, 8

	GF_MUL_XOR x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3

	cmp	vec_i, vec
	jl	%%next_vect

%if %0 == 1
	vmovdqu8 [dest1 + pos]{%%KMASK}, xp1
	vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
	vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
%else
	XSTR	[dest1 + pos], xp1
	XSTR	[dest2 + pos], xp2
	XSTR	[dest3 + pos], xp3
%endif
%endmacro

align 16
mk_global gf_3vect_dot_prod_avx512_gfni, function
func(gf_3vect_dot_prod_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	shl	vec, 3		;vec *= 8. Make vec_i count by 8
	mov	dest2, [dest1 + 8]
	mov	dest3, [dest1 + 2*8]
	mov	dest1, [dest1]

	cmp	len, 64
	jb	.len_lt_64

.loop64:

	ENCODE_64B_3

	add	pos, 64		;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit

	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_3 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif	; if AS_FEATURE_LEVEL >= 10
erasure_code/gf_4vect_dot_prod_avx512_gfni.asm (new file, 253 lines):

@@ -0,0 +1,253 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0   rdi
%define arg1   rsi
%define arg2   rdx
%define arg3   rcx
%define arg4   r8
%define arg5   r9

%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r12	; must be saved and restored
%define tmp5   r14	; must be saved and restored
%define tmp6   r15	; must be saved and restored

%define func(x) x: endbranch
%macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
%endmacro
%macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
%endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0   rcx
%define arg1   rdx
%define arg2   r8
%define arg3   r9

%define arg4   r12	; must be saved, loaded and restored
%define arg5   r15	; must be saved and restored
%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r14	; must be saved and restored
%define tmp5   rdi	; must be saved and restored
%define tmp6   rsi	; must be saved and restored
%define stack_size 3*16 + 7*8	; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + 8 + 8*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	mov	[rsp + 3*16 + 0*8], r12
	mov	[rsp + 3*16 + 1*8], r13
	mov	[rsp + 3*16 + 2*8], r14
	mov	[rsp + 3*16 + 3*8], r15
	mov	[rsp + 3*16 + 4*8], rdi
	mov	[rsp + 3*16 + 5*8], rsi
	end_prolog
	mov	arg4, arg(4)
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	mov	r12, [rsp + 3*16 + 0*8]
	mov	r13, [rsp + 3*16 + 1*8]
	mov	r14, [rsp + 3*16 + 2*8]
	mov	r15, [rsp + 3*16 + 3*8]
	mov	rdi, [rsp + 3*16 + 4*8]
	mov	rsi, [rsp + 3*16 + 5*8]
	add	rsp, stack_size
%endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define dest4     tmp5
%define vskip3    tmp6
%define pos       rax


%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu8
%define XSTR vmovdqu8
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa64
%define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif

%define xgft1 zmm5
%define xgft2 zmm6
%define xgft3 zmm7
%define xgft4 zmm8

%define x0    zmm0
%define xp1   zmm1
%define xp2   zmm2
%define xp3   zmm3
%define xp4   zmm4

default rel
[bits 64]

section .text

;;
;; Encodes 64 bytes of all "k" sources into 4x 64 bytes (parity disks)
;;
%macro ENCODE_64B_4 0-1
%define %%KMASK %1

	vpxorq	xp1, xp1, xp1
	vpxorq	xp2, xp2, xp2
	vpxorq	xp3, xp3, xp3
	vpxorq	xp4, xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

%%next_vect:
	mov	ptr, [src + vec_i]
%if %0 == 1
	vmovdqu8 x0{%%KMASK}, [ptr + pos]	;Get next source vector (less than 64 bytes)
%else
	XLDR	x0, [ptr + pos]			;Get next source vector (64 bytes)
%endif
	add	vec_i, 8

	vbroadcastf32x2 xgft1, [tmp]
	vbroadcastf32x2 xgft2, [tmp + vec]
	vbroadcastf32x2 xgft3, [tmp + vec*2]
	vbroadcastf32x2 xgft4, [tmp + vskip3]
	add	tmp, 8

	GF_MUL_XOR x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
		   xgft4, xgft4, xp4

	cmp	vec_i, vec
	jl	%%next_vect

%if %0 == 1
	vmovdqu8 [dest1 + pos]{%%KMASK}, xp1
	vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
	vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
	vmovdqu8 [dest4 + pos]{%%KMASK}, xp4
%else
	XSTR	[dest1 + pos], xp1
	XSTR	[dest2 + pos], xp2
	XSTR	[dest3 + pos], xp3
	XSTR	[dest4 + pos], xp4
%endif
%endmacro

align 16
mk_global gf_4vect_dot_prod_avx512_gfni, function
func(gf_4vect_dot_prod_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	mov	vskip3, vec
	imul	vskip3, 8*3
	shl	vec, 3		;vec *= 8. Make vec_i count by 8
	mov	dest2, [dest1 + 8]
	mov	dest3, [dest1 + 2*8]
	mov	dest4, [dest1 + 3*8]
	mov	dest1, [dest1]

	cmp	len, 64
	jb	.len_lt_64

.loop64:

	ENCODE_64B_4

	add	pos, 64		;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit

	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_4 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif	; if AS_FEATURE_LEVEL >= 10
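Editor's note: the vskip3 register (and vskip5 in the 6-output variant) exists because x86 addressing only encodes scale factors of 1, 2, 4 and 8 -- [tmp + vec] and [tmp + vec*2] and [tmp + vec*4] are directly encodable, but vec*3 and vec*5 must be precomputed. The table layout being indexed is the row-major one written by ec_init_tables_gfni(); as a C sketch:

	#include <stddef.h>

	/* Byte offset of the affine matrix for parity row j, source i,
	 * given the layout written by ec_init_tables_gfni(). */
	static size_t gfni_tbl_offset(int k, int i, int j)
	{
		return ((size_t) j * k + i) * 8;	/* asm: [tmp + vec*j], vec = k*8 */
	}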
erasure_code/gf_5vect_dot_prod_avx512_gfni.asm (new file, 275 lines):

@@ -0,0 +1,275 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_5vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0   rdi
%define arg1   rsi
%define arg2   rdx
%define arg3   rcx
%define arg4   r8
%define arg5   r9

%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r12	; must be saved and restored
%define tmp5   r14	; must be saved and restored
%define tmp6   r15	; must be saved and restored
%define tmp7   rbp	; must be saved and restored

%define func(x) x: endbranch
%macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbp
%endmacro
%macro FUNC_RESTORE 0
	pop	rbp
	pop	r15
	pop	r14
	pop	r13
	pop	r12
%endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0   rcx
%define arg1   rdx
%define arg2   r8
%define arg3   r9

%define arg4   r12	; must be saved, loaded and restored
%define arg5   r15	; must be saved and restored
%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r14	; must be saved and restored
%define tmp5   rdi	; must be saved and restored
%define tmp6   rsi	; must be saved and restored
%define tmp7   rbp	; must be saved and restored
%define stack_size 5*16 + 9*8	; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + 8 + 8*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	mov	[rsp + 5*16 + 0*8], r12
	mov	[rsp + 5*16 + 1*8], r13
	mov	[rsp + 5*16 + 2*8], r14
	mov	[rsp + 5*16 + 3*8], r15
	mov	[rsp + 5*16 + 4*8], rdi
	mov	[rsp + 5*16 + 5*8], rsi
	mov	[rsp + 5*16 + 6*8], rbp
	end_prolog
	mov	arg4, arg(4)
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	mov	r12, [rsp + 5*16 + 0*8]
	mov	r13, [rsp + 5*16 + 1*8]
	mov	r14, [rsp + 5*16 + 2*8]
	mov	r15, [rsp + 5*16 + 3*8]
	mov	rdi, [rsp + 5*16 + 4*8]
	mov	rsi, [rsp + 5*16 + 5*8]
	mov	rbp, [rsp + 5*16 + 6*8]
	add	rsp, stack_size
%endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define dest4     tmp5
%define vskip3    tmp6
%define dest5     tmp7
%define pos       rax


%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu8
%define XSTR vmovdqu8
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa64
%define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif

%define xgft1 zmm6
%define xgft2 zmm7
%define xgft3 zmm8
%define xgft4 zmm9
%define xgft5 zmm10

%define x0    zmm0
%define xp1   zmm1
%define xp2   zmm2
%define xp3   zmm3
%define xp4   zmm4
%define xp5   zmm5

default rel
[bits 64]

section .text

;;
;; Encodes 64 bytes of all "k" sources into 5x 64 bytes (parity disks)
;;
%macro ENCODE_64B_5 0-1
%define %%KMASK %1

	vpxorq	xp1, xp1, xp1
	vpxorq	xp2, xp2, xp2
	vpxorq	xp3, xp3, xp3
	vpxorq	xp4, xp4, xp4
	vpxorq	xp5, xp5, xp5
	mov	tmp, mul_array
	xor	vec_i, vec_i

%%next_vect:
	mov	ptr, [src + vec_i]
%if %0 == 1
	vmovdqu8 x0{%%KMASK}, [ptr + pos]	;Get next source vector (less than 64 bytes)
%else
	XLDR	x0, [ptr + pos]			;Get next source vector (64 bytes)
%endif
	add	vec_i, 8

	vbroadcastf32x2 xgft1, [tmp]
	vbroadcastf32x2 xgft2, [tmp + vec]
	vbroadcastf32x2 xgft3, [tmp + vec*2]
	vbroadcastf32x2 xgft4, [tmp + vskip3]
	vbroadcastf32x2 xgft5, [tmp + vec*4]
	add	tmp, 8

	GF_MUL_XOR x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
		   xgft4, xgft4, xp4, xgft5, xgft5, xp5

	cmp	vec_i, vec
	jl	%%next_vect

%if %0 == 1
	vmovdqu8 [dest1 + pos]{%%KMASK}, xp1
	vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
	vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
	vmovdqu8 [dest4 + pos]{%%KMASK}, xp4
	vmovdqu8 [dest5 + pos]{%%KMASK}, xp5
%else
	XSTR	[dest1 + pos], xp1
	XSTR	[dest2 + pos], xp2
	XSTR	[dest3 + pos], xp3
	XSTR	[dest4 + pos], xp4
	XSTR	[dest5 + pos], xp5
%endif
%endmacro

align 16
mk_global gf_5vect_dot_prod_avx512_gfni, function
func(gf_5vect_dot_prod_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	mov	vskip3, vec
	imul	vskip3, 8*3
	shl	vec, 3		;vec *= 8. Make vec_i count by 8
	mov	dest2, [dest1 + 8]
	mov	dest3, [dest1 + 2*8]
	mov	dest4, [dest1 + 3*8]
	mov	dest5, [dest1 + 4*8]
	mov	dest1, [dest1]

	cmp	len, 64
	jb	.len_lt_64

.loop64:

	ENCODE_64B_5

	add	pos, 64		;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit

	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_5 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif	; if AS_FEATURE_LEVEL >= 10
erasure_code/gf_6vect_dot_prod_avx512_gfni.asm (new file, 292 lines):

@@ -0,0 +1,292 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_6vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0   rdi
%define arg1   rsi
%define arg2   rdx
%define arg3   rcx
%define arg4   r8
%define arg5   r9

%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r12	; must be saved and restored
%define tmp5   r14	; must be saved and restored
%define tmp6   r15	; must be saved and restored
%define tmp7   rbp	; must be saved and restored
%define tmp8   rbx	; must be saved and restored

%define func(x) x: endbranch
%macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbp
	push	rbx
%endmacro
%macro FUNC_RESTORE 0
	pop	rbx
	pop	rbp
	pop	r15
	pop	r14
	pop	r13
	pop	r12
%endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0   rcx
%define arg1   rdx
%define arg2   r8
%define arg3   r9

%define arg4   r12	; must be saved, loaded and restored
%define arg5   r15	; must be saved and restored
%define tmp    r11
%define tmp2   r10
%define tmp3   r13	; must be saved and restored
%define tmp4   r14	; must be saved and restored
%define tmp5   rdi	; must be saved and restored
%define tmp6   rsi	; must be saved and restored
%define tmp7   rbp	; must be saved and restored
%define tmp8   rbx	; must be saved and restored
%define stack_size 7*16 + 9*8	; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + 8 + 8*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	mov	[rsp + 7*16 + 0*8], r12
	mov	[rsp + 7*16 + 1*8], r13
	mov	[rsp + 7*16 + 2*8], r14
	mov	[rsp + 7*16 + 3*8], r15
	mov	[rsp + 7*16 + 4*8], rdi
	mov	[rsp + 7*16 + 5*8], rsi
	mov	[rsp + 7*16 + 6*8], rbp
	mov	[rsp + 7*16 + 7*8], rbx
	end_prolog
	mov	arg4, arg(4)
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	mov	r12, [rsp + 7*16 + 0*8]
	mov	r13, [rsp + 7*16 + 1*8]
	mov	r14, [rsp + 7*16 + 2*8]
	mov	r15, [rsp + 7*16 + 3*8]
	mov	rdi, [rsp + 7*16 + 4*8]
	mov	rsi, [rsp + 7*16 + 5*8]
	mov	rbp, [rsp + 7*16 + 6*8]
	mov	rbx, [rsp + 7*16 + 7*8]
	add	rsp, stack_size
%endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define dest4     tmp5
%define vskip3    tmp6
%define dest5     tmp7
%define vskip5    tmp8
%define pos       rax


%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu8
%define XSTR vmovdqu8
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa64
%define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif

%define xgft1 zmm7
%define xgft2 zmm8
%define xgft3 zmm9
%define xgft4 zmm10
%define xgft5 zmm11
%define xgft6 zmm12

%define x0    zmm0
%define xp1   zmm1
%define xp2   zmm2
%define xp3   zmm3
%define xp4   zmm4
%define xp5   zmm5
%define xp6   zmm6

default rel
[bits 64]

section .text

;;
;; Encodes 64 bytes of all "k" sources into 6x 64 bytes (parity disks)
;;
%macro ENCODE_64B_6 0-1
%define %%KMASK %1

	vpxorq	xp1, xp1, xp1
	vpxorq	xp2, xp2, xp2
	vpxorq	xp3, xp3, xp3
	vpxorq	xp4, xp4, xp4
	vpxorq	xp5, xp5, xp5
	vpxorq	xp6, xp6, xp6
	mov	tmp, mul_array
	xor	vec_i, vec_i

%%next_vect:
	mov	ptr, [src + vec_i]
%if %0 == 1
	vmovdqu8 x0{%%KMASK}, [ptr + pos]	;Get next source vector (less than 64 bytes)
%else
	XLDR	x0, [ptr + pos]			;Get next source vector (64 bytes)
%endif
	add	vec_i, 8

	vbroadcastf32x2 xgft1, [tmp]
	vbroadcastf32x2 xgft2, [tmp + vec]
	vbroadcastf32x2 xgft3, [tmp + vec*2]
	vbroadcastf32x2 xgft4, [tmp + vskip3]
	vbroadcastf32x2 xgft5, [tmp + vec*4]
	vbroadcastf32x2 xgft6, [tmp + vskip5]
	add	tmp, 8

	GF_MUL_XOR x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2, xgft3, xgft3, xp3, \
		   xgft4, xgft4, xp4, xgft5, xgft5, xp5, xgft6, xgft6, xp6

	cmp	vec_i, vec
	jl	%%next_vect

	mov	ptr, [dest1]		;reuse ptr for dest1
	mov	tmp, [dest1 + 5*8]	;reuse tmp for dest6

%if %0 == 1
	vmovdqu8 [dest2 + pos]{%%KMASK}, xp2
	vmovdqu8 [dest3 + pos]{%%KMASK}, xp3
	vmovdqu8 [dest4 + pos]{%%KMASK}, xp4
	vmovdqu8 [dest5 + pos]{%%KMASK}, xp5
	vmovdqu8 [ptr + pos]{%%KMASK}, xp1	; dest 1
	vmovdqu8 [tmp + pos]{%%KMASK}, xp6	; dest 6
%else
	XSTR	[dest2 + pos], xp2
	XSTR	[dest3 + pos], xp3
	XSTR	[dest4 + pos], xp4
	XSTR	[dest5 + pos], xp5
	XSTR	[ptr + pos], xp1	; dest 1
	XSTR	[tmp + pos], xp6	; dest 6
%endif
%endmacro

align 16
mk_global gf_6vect_dot_prod_avx512_gfni, function
func(gf_6vect_dot_prod_avx512_gfni)
	FUNC_SAVE

	xor	pos, pos
	mov	vskip3, vec
	imul	vskip3, 3*8
	mov	vskip5, vec
	imul	vskip5, 5*8
	shl	vec, 3		;vec *= 8. Make vec_i count by 8
	mov	dest2, [dest1 + 8]
	mov	dest3, [dest1 + 2*8]
	mov	dest4, [dest1 + 3*8]
	mov	dest5, [dest1 + 4*8]	;dest1 and dest6 are loaded inside ENCODE_64B_6 (no spare registers)

	cmp	len, 64
	jb	.len_lt_64

.loop64:

	ENCODE_64B_6

	add	pos, 64		;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit

	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B_6 k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif	; if AS_FEATURE_LEVEL >= 10
erasure_code/gf_vect_dot_prod_avx512_gfni.asm (new file, 190 lines):

@@ -0,0 +1,190 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_dot_prod_avx512_gfni(len, vec, *g_tbls, **buffs, *dest);
;;;

%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
%define arg0   rdi
%define arg1   rsi
%define arg2   rdx
%define arg3   rcx
%define arg4   r8
%define arg5   r9

%define tmp    r11
%define tmp2   r10

%define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0   rcx
%define arg1   rdx
%define arg2   r8
%define arg3   r9

%define arg4   r12	; must be saved, loaded and restored
%define arg5   r13	; must be saved and restored
%define tmp    r11
%define tmp2   r10
%define stack_size 0*16 + 3*8	; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + 8 + 8*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	mov	[rsp + 0*8], r12
	mov	[rsp + 1*8], r13
	end_prolog
	mov	arg4, arg(4)
%endmacro

%macro FUNC_RESTORE 0
	mov	r12, [rsp + 0*8]
	mov	r13, [rsp + 1*8]
	add	rsp, stack_size
%endmacro
%endif


%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define pos       rax


%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu8
%define XSTR vmovdqu8
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa64
%define XSTR vmovdqa64
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif

%define xgft1 zmm2

%define x0    zmm0
%define xp1   zmm1

default rel
[bits 64]
section .text

;;
;; Encodes 64 bytes of all "k" sources into 64 bytes (single parity disk)
;;
%macro ENCODE_64B 0-1
%define %%KMASK %1

	vpxorq	xp1, xp1, xp1
	mov	tmp, mul_array
	xor	vec_i, vec_i

%%next_vect:
	mov	ptr, [src + vec_i]
%if %0 == 1
	vmovdqu8 x0{%%KMASK}, [ptr + pos]	;Get next source vector (less than 64 bytes)
%else
	XLDR	x0, [ptr + pos]			;Get next source vector (64 bytes)
%endif
	add	vec_i, 8

	vbroadcastf32x2 xgft1, [tmp]	;Broadcast next 8-byte affine matrix to all qwords
	add	tmp, 8

	GF_MUL_XOR x0, xgft1, xgft1, xp1

	cmp	vec_i, vec
	jl	%%next_vect

%if %0 == 1
	vmovdqu8 [dest1 + pos]{%%KMASK}, xp1
%else
	XSTR	[dest1 + pos], xp1
%endif
%endmacro

align 16
mk_global gf_vect_dot_prod_avx512_gfni, function
func(gf_vect_dot_prod_avx512_gfni)
	FUNC_SAVE
	xor	pos, pos
	shl	vec, 3		;vec *= 8. Make vec_i count by 8

	cmp	len, 64
	jb	.len_lt_64

.loop64:

	ENCODE_64B

	add	pos, 64		;Loop on 64 bytes at a time
	sub	len, 64
	cmp	len, 64
	jge	.loop64

.len_lt_64:
	cmp	len, 0
	jle	.exit

	xor	tmp, tmp
	bts	tmp, len
	dec	tmp
	kmovq	k1, tmp

	ENCODE_64B k1

.exit:
	vzeroupper

	FUNC_RESTORE
	ret

endproc_frame
%endif	; if AS_FEATURE_LEVEL >= 10
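Editor's note: the ENCODE_64B loop above is easiest to read in intrinsics form. A sketch of the full-block path (the asm handles the tail with a k-mask instead), assuming g_tbls was filled by ec_init_tables_gfni(), i.e. one 8-byte matrix per source; this is not the shipped implementation, just the same data flow:

	#include <immintrin.h>
	#include <string.h>

	static void gf_vect_dot_prod_gfni_sketch(int len, int k, const unsigned char *g_tbls,
						 unsigned char **data, unsigned char *dest)
	{
		for (int pos = 0; pos + 64 <= len; pos += 64) {
			__m512i p = _mm512_setzero_si512();

			for (int i = 0; i < k; i++) {
				__m512i x = _mm512_loadu_si512(data[i] + pos);
				long long mtx;

				memcpy(&mtx, g_tbls + 8 * i, 8);	/* vbroadcastf32x2 */
				__m512i m = _mm512_set1_epi64(mtx);

				/* 64 parallel GF(2^8) multiplies, then accumulate */
				p = _mm512_xor_si512(p, _mm512_gf2p8affine_epi64_epi8(x, m, 0));
			}
			_mm512_storeu_si512(dest + pos, p);
		}
	}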
erasure_code/gf_vect_gfni.inc (new file, 67 lines):

@@ -0,0 +1,67 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2023 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;
; Multiply one source register by up to six different GF table registers
; and XOR the results into the partial-product registers
;
%macro GF_MUL_XOR 4-19
%define %%SRC      %1
%define %%GFTABLE1 %2
%define %%TMP1     %3
%define %%PARTIAL1 %4
%define %%GFTABLE2 %5
%define %%TMP2     %6
%define %%PARTIAL2 %7
%define %%GFTABLE3 %8
%define %%TMP3     %9
%define %%PARTIAL3 %10
%define %%GFTABLE4 %11
%define %%TMP4     %12
%define %%PARTIAL4 %13
%define %%GFTABLE5 %14
%define %%TMP5     %15
%define %%PARTIAL5 %16
%define %%GFTABLE6 %17
%define %%TMP6     %18
%define %%PARTIAL6 %19

%define %%N_BLOCKS ((%0 - 1) / 3)

%assign %%I 1
%rep %%N_BLOCKS
	vgf2p8affineqb	%%TMP %+ %%I, %%SRC, %%GFTABLE %+ %%I, 0x00
%assign %%I (%%I + 1)
%endrep
%assign %%I 1
%rep %%N_BLOCKS
	vpxorq	%%PARTIAL %+ %%I, %%TMP %+ %%I
%assign %%I (%%I + 1)
%endrep
%endmacro
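Editor's note: GF_MUL_XOR issues all the vgf2p8affineqb multiplies first and then all the vpxorq accumulations (two %rep passes), which keeps the independent multiplies free to overlap before the XOR chain consumes them; note the callers pass the same register for GFTABLE and TMP, since the matrix is re-broadcast on every iteration anyway. A C intrinsics sketch of a three-output expansion, keeping the same grouping:

	#include <immintrin.h>

	static inline void gf_mul_xor3(__m512i x0, __m512i g1, __m512i g2, __m512i g3,
				       __m512i *p1, __m512i *p2, __m512i *p3)
	{
		/* all multiplies first, mirroring the macro's first %rep pass */
		__m512i t1 = _mm512_gf2p8affine_epi64_epi8(x0, g1, 0);
		__m512i t2 = _mm512_gf2p8affine_epi64_epi8(x0, g2, 0);
		__m512i t3 = _mm512_gf2p8affine_epi64_epi8(x0, g3, 0);

		/* then all accumulations, mirroring the second %rep pass */
		*p1 = _mm512_xor_si512(*p1, t1);
		*p2 = _mm512_xor_si512(*p2, t2);
		*p3 = _mm512_xor_si512(*p3, t3);
	}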