Add avx512 versions of ec_encode_data

- Includes gf_nvect_dot_prod, gf_nvect_mad functions
- Change ec multibinary to use common macros
- Autoconf checks for nasm or yasm support and picks if available
- Leave out compile of any avx512 code if assembler not available

Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
2025-11-01 11:52:52 +01:00 · 2015-11-18 15:40:52 -07:00
parent bc4dfc9bbc
commit a5b324d2cd
17 changed files with 2512 additions and 342 deletions
--- a/Makefile.nmake
+++ b/Makefile.nmake
@@ -27,9 +27,9 @@
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ########################################################################

-objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj
+objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_avx512.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_avx512.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_avx512.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_avx512.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_avx512.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_avx512.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_avx512.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_avx512.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj

-INCLUDES  = -I. -Ierasure_code -Iinclude
+INCLUDES  = -I./ -Ierasure_code/ -Iinclude/
 LINKFLAGS = /nologo
 CFLAGS   = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D)
 AFLAGS   = -f win64 $(INCLUDES) $(D)
--- a/Release_notes.txt
+++ b/Release_notes.txt
@@ -1,5 +1,5 @@
 =============================================================================
-v2.14 Intel Intelligent Storage Acceleration Library Release Notes
+v2.15 Intel Intelligent Storage Acceleration Library Release Notes
      Open Source Version
 =============================================================================

@@ -23,6 +23,12 @@ RELEASE NOTE CONTENTS
 =============================================================================
 2. FIXED ISSUES
 =============================================================================
+v2.15
+
+* Fix for Windows register save in gf_6vect_mad_avx2.asm.  Only affects Windows
+  versions of ec_encode_data_update() running with AVX2.  A GP register was not
+  properly restored, resulting in corruption on return.
+
 v2.14

 * Building in unit directories is no longer supported removing the issue of
@@ -37,6 +43,14 @@ v2.10
 =============================================================================
 3. CHANGE LOG & FEATURES ADDED 
 =============================================================================
+v2.15
+
+* Erasure code updates. New AVX512 versions.
+
+* Nasm support.  ISA-L ported to build with nasm or yasm assembler.
+
+* Windows DLL support.  Windows builds DLL by default.
+
 v2.14

 * Autoconf and autotools build allows easier porting to additional systems.
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@

 AC_PREREQ(2.69)
 AC_INIT([libisal],
-        [2.14.0],
+        [2.15.0],
        [sg.support.isal@intel.com],
        [isa-l],
        [http://01.org/storage-acceleration-library])
@@ -30,17 +30,6 @@ LT_INIT
 AC_PREFIX_DEFAULT([/usr])
 AC_PROG_SED
 AC_PROG_MKDIR_P
-AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
-if test "$HAVE_YASM" = "no"; then
-  AC_MSG_ERROR([yasm not found as required.])
-fi
-AC_MSG_CHECKING([checking for modern yasm])
-AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
-if yasm -f elf64 -p gas conftest.c ; then
-   AC_MSG_RESULT([yes])
-else
-   AC_MSG_FAILURE([need modern yasm])
-fi

 # Options
 AC_ARG_ENABLE([debug],
@@ -50,6 +39,80 @@ AS_IF([test "x$enable_debug" = "xyes"], [
        AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.])
 ])

+# Check for yasm and yasm features.
+# Sets with_modern_yasm=yes when yasm assembles an AVX instruction and
+# yasm_knows_avx512=yes when it also assembles a zmm instruction.
+AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
+if test "$HAVE_YASM" = "no"; then
+  AC_MSG_RESULT([no yasm])
+else
+  AC_MSG_CHECKING([for modern yasm])
+  AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
+  if yasm -f elf64 -p gas conftest.c ; then
+    with_modern_yasm=yes
+    AC_MSG_RESULT([yes])
+    AC_MSG_CHECKING([for optional yasm AVX512 support])
+    AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpshufb %zmm0, %zmm1, %zmm2;]])])
+    if yasm -f elf64 -p gas conftest.c 2> /dev/null; then
+      yasm_knows_avx512=yes
+      AC_MSG_RESULT([yes])
+    else
+      AC_MSG_RESULT([no])
+    fi
+  else
+    # An outdated yasm is not fatal by itself: the assembler selection further
+    # down still errors out when neither yasm nor nasm is usable.
+    AC_MSG_RESULT([no])
+  fi
+fi
+
+# Check for nasm and nasm features (sets with_modern_nasm and nasm_knows_avx512)
+AC_CHECK_PROG(HAVE_NASM, nasm, yes, no)
+if test "$HAVE_NASM" = "no"; then
+  AC_MSG_RESULT([no nasm])
+else
+  AC_MSG_CHECKING([for modern nasm])
+  AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])])
+  sed -i -e '/pblendvb/!d' conftest.c  # keep only the probe instruction line in conftest.c
+  if nasm -f elf64 conftest.c 2> /dev/null; then
+    with_modern_nasm=yes
+    AC_MSG_RESULT([yes])
+    AC_MSG_CHECKING([for optional nasm AVX512 support])
+    AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpshufb zmm0, zmm1, zmm2;]])])
+    sed -i -e '/vpshufb/!d' conftest.c  # same filtering for the zmm probe
+    if nasm -f elf64  conftest.c 2> /dev/null; then
+      nasm_knows_avx512=yes
+      AC_MSG_RESULT([yes])
+    else
+      AC_MSG_RESULT([no])
+    fi
+  else
+    AC_MSG_RESULT([no])  # not fatal here; the assembler selection below decides
+  fi
+fi
+
+# Pick an assembler yasm or nasm, preferring one that passed the AVX512 probe.
+# A non-empty $AS supplied by the caller is kept untouched.
+# Plain "=" is used for the comparison: "==" is not a portable test operator
+# and makes the selection silently fall through on shells that reject it.
+if test x"$AS" = x""; then
+  if test x"$yasm_knows_avx512" = x"yes"; then
+    AS=yasm
+  elif test x"$nasm_knows_avx512" = x"yes"; then
+    AS=nasm
+  elif test x"$with_modern_yasm" = x"yes"; then
+    AS=yasm
+  elif test x"$with_modern_nasm" = x"yes"; then
+    AS=nasm
+  else
+    AC_MSG_ERROR([No modern yasm or nasm found as required. Yasm should be 1.2.0 or later, and nasm should be v2.11.01 or later.])
+  fi
+fi
+echo "Using assembler $AS"
+
+# Enable the AVX512 sources only when the selected assembler passed its AVX512
+# probe.  The condition is spelled with && / || lists because the -a, -o and
+# parenthesis operands of test are obsolescent and ambiguous in some shells.
+if { test x"$AS" = x"yasm" && test x"$yasm_knows_avx512" = x"yes"; } ||
+   { test x"$AS" = x"nasm" && test x"$nasm_knows_avx512" = x"yes"; }; then
+  AC_DEFINE(HAVE_AS_KNOWS_AVX512, [1], [Assembler can do AVX512.])
+  have_as_knows_avx512=yes
+else
+  AC_MSG_RESULT([Assembler does not understand AVX512 opcodes.  Consider upgrading for best performance.])
+fi
+AM_CONDITIONAL(USE_YASM, test x"$AS" = x"yasm")
+AM_CONDITIONAL(USE_NASM, test x"$AS" = x"nasm")
+AM_CONDITIONAL(WITH_AVX512, test x"$have_as_knows_avx512" = x"yes")
+

 case $target in
     *linux*)	arch=linux   yasm_args="-f elf64";;
@@ -62,7 +125,6 @@ AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin")
 AC_MSG_RESULT([Using yasm args target "$arch" "$yasm_args"])

 # Check for header files
-#AC_CHECK_HEADERS([limits.h stddef.h stdint.h stdlib.h string.h sys/time.h unistd.h])
 AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h])

 # Checks for typedefs, structures, and compiler characteristics.
--- a/erasure_code/Makefile.am
+++ b/erasure_code/Makefile.am
@@ -69,6 +69,16 @@ lsrc         += erasure_code/ec_highlevel_func.c \
 		erasure_code/gf_6vect_mad_avx2.asm \
 		erasure_code/ec_multibinary.asm

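+# AVX512 sources: always listed (this line is a comment, not an automake conditional); NOTE(review): relies on each .asm guarding its body with %ifdef HAVE_AS_KNOWS_AVX512, as gf_2vect_dot_prod_avx512.asm does -- confirm for the others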
+lsrc +=		erasure_code/gf_vect_dot_prod_avx512.asm \
+		erasure_code/gf_2vect_dot_prod_avx512.asm \
+		erasure_code/gf_3vect_dot_prod_avx512.asm \
+		erasure_code/gf_4vect_dot_prod_avx512.asm \
+		erasure_code/gf_vect_mad_avx512.asm \
+		erasure_code/gf_2vect_mad_avx512.asm \
+		erasure_code/gf_3vect_mad_avx512.asm \
+		erasure_code/gf_4vect_mad_avx512.asm
+
 lsrc32	     += erasure_code/ec_highlevel_func.c \
 		erasure_code/ec_multibinary.asm \
 		erasure_code/ec_base.c \
@@ -85,7 +95,7 @@ lsrc32	     += erasure_code/ec_highlevel_func.c \
 		erasure_code/gf_3vect_dot_prod_avx2.asm \
 		erasure_code/gf_4vect_dot_prod_avx2.asm

-unit_tests32 += erasure_code_base_test \
+unit_tests32 += erasure_code/erasure_code_base_test \
 		erasure_code/erasure_code_test \
 		erasure_code/erasure_code_sse_test \
 		erasure_code/gf_vect_mul_test \
@@ -114,6 +124,7 @@ extern_hdrs  += include/erasure_code.h \
 		include/gf_vect_mul.h

 other_src    += erasure_code/ec_base.h \
+		include/multibinary.asm \
 		include/reg_sizes.asm

 check_tests  += erasure_code/gf_vect_mul_test \
--- a/erasure_code/ec_base.c
+++ b/erasure_code/ec_base.c
@@ -351,10 +351,10 @@ struct slver gf_mul_slver_00000214;
 struct slver gf_mul_slver = { 0x0214, 0x00, 0x00 };

 struct slver gf_invert_matrix_slver_00000215;
-struct slver gf_invert_matrix_slver = { 0x0215, 0x00, 0x00};
+struct slver gf_invert_matrix_slver = { 0x0215, 0x00, 0x00 };

 struct slver gf_gen_rs_matrix_slver_00000216;
 struct slver gf_gen_rs_matrix_slver = { 0x0216, 0x00, 0x00 };

 struct slver gf_gen_cauchy1_matrix_slver_00000217;
-struct slver gf_gen_cauchy1_matrix_slver = { 0x0217, 0x00, 0x00};
+struct slver gf_gen_cauchy1_matrix_slver = { 0x0217, 0x00, 0x00 };
--- a/erasure_code/ec_highlevel_func.c
+++ b/erasure_code/ec_highlevel_func.c
@@ -134,6 +134,86 @@ void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsign

 }

+#ifdef HAVE_AS_KNOWS_AVX512
+/* Prototypes of the AVX512 kernels called by the two ec_encode_data*_avx512 wrappers in this #ifdef section. */
+extern int gf_vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls, unsigned char **data,
+				   unsigned char *dest);
+extern int gf_2vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls,
+				    unsigned char **data, unsigned char **coding);
+extern int gf_3vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls,
+				    unsigned char **data, unsigned char **coding);
+extern int gf_4vect_dot_prod_avx512(int len, int k, unsigned char *g_tbls,
+				    unsigned char **data, unsigned char **coding);
+extern void gf_vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
+			       unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
+				unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
+				unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_avx512(int len, int vec, int vec_i, unsigned char *gftbls,
+				unsigned char *src, unsigned char **dest);
+
+/*
+ * Dispatches an ec_encode_data request to the AVX512 dot-product kernels,
+ * producing up to four coding rows per kernel call.
+ */
+void ec_encode_data_avx512(int len, int k, int rows, unsigned char *g_tbls,
+			   unsigned char **data, unsigned char **coding)
+{
+	/* Buffers shorter than 64 bytes are handed to the base implementation. */
+	if (len < 64) {
+		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
+		return;
+	}
+
+	/* Groups of four rows: step the table pointer by 4 * k * 32 bytes each. */
+	for (; rows >= 4; rows -= 4) {
+		gf_4vect_dot_prod_avx512(len, k, g_tbls, data, coding);
+		g_tbls += 4 * k * 32;
+		coding += 4;
+	}
+
+	/* Leftover 1-3 rows go to the kernel of matching width; 0 needs nothing. */
+	if (rows == 3)
+		gf_3vect_dot_prod_avx512(len, k, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_dot_prod_avx512(len, k, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_dot_prod_avx512(len, k, g_tbls, data, *coding);
+}
+
+/*
+ * Dispatches an ec_encode_data_update request to the AVX512 multiply-add
+ * kernels, updating up to four coding rows per kernel call.
+ */
+void ec_encode_data_update_avx512(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+				  unsigned char *data, unsigned char **coding)
+{
+	/* Buffers shorter than 64 bytes are handed to the base implementation. */
+	if (len < 64) {
+		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+		return;
+	}
+
+	/* Groups of four rows: step the table pointer by 4 * k * 32 bytes each. */
+	for (; rows >= 4; rows -= 4) {
+		gf_4vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
+		g_tbls += 4 * k * 32;
+		coding += 4;
+	}
+
+	/* Leftover 1-3 rows go to the kernel of matching width; 0 needs nothing. */
+	if (rows == 3)
+		gf_3vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 2)
+		gf_2vect_mad_avx512(len, k, vec_i, g_tbls, data, coding);
+	else if (rows == 1)
+		gf_vect_mad_avx512(len, k, vec_i, g_tbls, data, *coding);
+}
+
+#endif // HAVE_AS_KNOWS_AVX512
+
 #if __WORDSIZE == 64 || _WIN64 || __x86_64__

 void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
--- a/erasure_code/ec_multibinary.asm
+++ b/erasure_code/ec_multibinary.asm
@@ -34,36 +34,23 @@
 %endif

 %include "reg_sizes.asm"
+%include "multibinary.asm"

 %ifidn __OUTPUT_FORMAT__, elf32
-
-[bits 32]
-
- %define def_wrd		dd
- %define wrd_sz  	dword
- %define arg1		esi
- %define arg2		eax
- %define arg3		ebx
- %define arg4		ecx
- %define arg5		edx
-
+ [bits 32]
 %else
-
 default rel
 [bits 64]

- %define def_wrd 	dq
- %define wrd_sz  	qword
- %define arg1		rsi
- %define arg2		rax
- %define arg3		rbx
- %define arg4		rcx
- %define arg5		rdx
-
-
 extern ec_encode_data_update_sse
 extern ec_encode_data_update_avx
 extern ec_encode_data_update_avx2
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern ec_encode_data_avx512
+ extern gf_vect_dot_prod_avx512
+ extern ec_encode_data_update_avx512
+ extern gf_vect_mad_avx512
+%endif
 extern gf_vect_mul_sse
 extern gf_vect_mul_avx

@@ -85,311 +72,38 @@ extern ec_encode_data_sse
 extern ec_encode_data_avx
 extern ec_encode_data_avx2

+mbin_interface ec_encode_data
+mbin_interface gf_vect_dot_prod
+mbin_interface gf_vect_mul
+mbin_interface ec_encode_data_update
+mbin_interface gf_vect_mad

-section .data
-;;; *_mbinit are initial values for *_dispatched; is updated on first call.
-;;; Therefore, *_dispatch_init is only executed on first call.
-
-ec_encode_data_dispatched:
-	def_wrd      ec_encode_data_mbinit
-
-gf_vect_mul_dispatched:
-	def_wrd      gf_vect_mul_mbinit
-
-gf_vect_dot_prod_dispatched:
-	def_wrd      gf_vect_dot_prod_mbinit
-
-ec_encode_data_update_dispatched:
-	def_wrd      ec_encode_data_update_mbinit
-
-gf_vect_mad_dispatched:
-	def_wrd      gf_vect_mad_mbinit
-
-section .text
-;;;;
-; ec_encode_data multibinary function
-;;;;
-global ec_encode_data:function
-ec_encode_data_mbinit:
-	call	ec_encode_data_dispatch_init
-
-ec_encode_data:
-	jmp	wrd_sz [ec_encode_data_dispatched]
-
-ec_encode_data_dispatch_init:
-	push    arg1
-	push    arg2
-	push    arg3
-	push    arg4
-	push    arg5
-	lea     arg1, [ec_encode_data_base WRT_OPT] ; Default
-
-	mov     eax, 1
-	cpuid
-	lea     arg3, [ec_encode_data_sse WRT_OPT]
-	test    ecx, FLAG_CPUID1_ECX_SSE4_1
-	cmovne  arg1, arg3
-
-	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	lea	arg3, [ec_encode_data_avx WRT_OPT]
-
-	jne	_done_ec_encode_data_init
-	mov	arg1, arg3
-
-	;; Try for AVX2
-	xor	ecx, ecx
-	mov	eax, 7
-	cpuid
-	test	ebx, FLAG_CPUID1_EBX_AVX2
-	lea     arg3, [ec_encode_data_avx2 WRT_OPT]
-	cmovne	arg1, arg3
-	;; Does it have xmm and ymm support
-	xor	ecx, ecx
-	xgetbv
-	and	eax, FLAG_XGETBV_EAX_XMM_YMM
-	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
-	je	_done_ec_encode_data_init
-	lea     arg1, [ec_encode_data_sse WRT_OPT]
-
-_done_ec_encode_data_init:
-	pop     arg5
-	pop     arg4
-	pop     arg3
-	pop     arg2
-	mov     [ec_encode_data_dispatched], arg1
-	pop     arg1
-	ret
-
-;;;;
-; gf_vect_mul multibinary function
-;;;;
-global gf_vect_mul:function
-gf_vect_mul_mbinit:
-	call    gf_vect_mul_dispatch_init
-
-gf_vect_mul:
-	jmp	wrd_sz [gf_vect_mul_dispatched]
-
-gf_vect_mul_dispatch_init:
-	push    arg1
-%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
-	lea     arg1, [gf_vect_mul_base]
+%ifidn __OUTPUT_FORMAT__, elf32
+ mbin_dispatch_init5 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2
+ mbin_dispatch_init5 gf_vect_dot_prod, gf_vect_dot_prod_base, gf_vect_dot_prod_sse, gf_vect_dot_prod_avx, gf_vect_dot_prod_avx2
+ mbin_dispatch_init2 gf_vect_mul, gf_vect_mul_base
+ mbin_dispatch_init2 ec_encode_data_update, ec_encode_data_update_base
+ mbin_dispatch_init2 gf_vect_mad, gf_vect_mad_base
 %else
-	push    rax
-	push    rbx
-	push    rcx
-	push    rdx
-	lea     arg1, [gf_vect_mul_base WRT_OPT] ; Default

-	mov     eax, 1
-	cpuid
-	test    ecx, FLAG_CPUID1_ECX_SSE4_2
-	lea     rbx, [gf_vect_mul_sse WRT_OPT]
-	je	_done_gf_vect_mul_dispatch_init
-	mov  	arg1, rbx
+ mbin_dispatch_init5 gf_vect_mul, gf_vect_mul_base, gf_vect_mul_sse, gf_vect_mul_avx, gf_vect_mul_avx

-	;; Try for AVX
-	and     ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
-	cmp     ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
-	jne     _done_gf_vect_mul_dispatch_init
-
-	;; Does it have xmm and ymm support
-	xor     ecx, ecx
-	xgetbv
-	and     eax, FLAG_XGETBV_EAX_XMM_YMM
-	cmp     eax, FLAG_XGETBV_EAX_XMM_YMM
-	jne     _done_gf_vect_mul_dispatch_init
-	lea     arg1, [gf_vect_mul_avx WRT_OPT]
-
-_done_gf_vect_mul_dispatch_init:
-	pop     rdx
-	pop     rcx
-	pop     rbx
-	pop     rax
-%endif			;; END 32-bit check
-	mov     [gf_vect_mul_dispatched], arg1
-	pop     arg1
-	ret
-
-;;;;
-; ec_encode_data_update multibinary function
-;;;;
-global ec_encode_data_update:function
-ec_encode_data_update_mbinit:
-	call	ec_encode_data_update_dispatch_init
-
-ec_encode_data_update:
-	jmp	wrd_sz [ec_encode_data_update_dispatched]
-
-ec_encode_data_update_dispatch_init:
-	push    arg1
-%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
-	lea     arg1, [ec_encode_data_update_base]
+%ifdef HAVE_AS_KNOWS_AVX512
+  mbin_dispatch_init6 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2, ec_encode_data_avx512
+  mbin_dispatch_init6 ec_encode_data_update, ec_encode_data_update_base, ec_encode_data_update_sse, ec_encode_data_update_avx, ec_encode_data_update_avx2, ec_encode_data_update_avx512
+  mbin_dispatch_init6 gf_vect_mad, gf_vect_mad_base, gf_vect_mad_sse, gf_vect_mad_avx, gf_vect_mad_avx2, gf_vect_mad_avx512
+  mbin_dispatch_init6 gf_vect_dot_prod, gf_vect_dot_prod_base, gf_vect_dot_prod_sse, gf_vect_dot_prod_avx, gf_vect_dot_prod_avx2, gf_vect_dot_prod_avx512
 %else
-	push    rax
-	push    rbx
-	push    rcx
-	push    rdx
-	lea     arg1, [ec_encode_data_update_base WRT_OPT] ; Default
-
-	mov     eax, 1
-	cpuid
-	lea     rbx, [ec_encode_data_update_sse WRT_OPT]
-	test    ecx, FLAG_CPUID1_ECX_SSE4_1
-	cmovne  arg1, rbx
-
-	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	lea	rbx, [ec_encode_data_update_avx WRT_OPT]
-
-	jne	_done_ec_encode_data_update_init
-	mov	rsi, rbx
-
-	;; Try for AVX2
-	xor	ecx, ecx
-	mov	eax, 7
-	cpuid
-	test	ebx, FLAG_CPUID1_EBX_AVX2
-	lea     rbx, [ec_encode_data_update_avx2 WRT_OPT]
-	cmovne	rsi, rbx
-
-	;; Does it have xmm and ymm support
-	xor	ecx, ecx
-	xgetbv
-	and	eax, FLAG_XGETBV_EAX_XMM_YMM
-	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
-	je	_done_ec_encode_data_update_init
-	lea     rsi, [ec_encode_data_update_sse WRT_OPT]
-
-_done_ec_encode_data_update_init:
-	pop     rdx
-	pop     rcx
-	pop     rbx
-	pop     rax
-%endif			;; END 32-bit check
-	mov     [ec_encode_data_update_dispatched], arg1
-	pop     arg1
-	ret
-
-;;;;
-; gf_vect_dot_prod multibinary function
-;;;;
-global gf_vect_dot_prod:function
-gf_vect_dot_prod_mbinit:
-	call    gf_vect_dot_prod_dispatch_init
-
-gf_vect_dot_prod:
-	jmp     wrd_sz [gf_vect_dot_prod_dispatched]
-
-gf_vect_dot_prod_dispatch_init:
-	push    arg1
-	push    arg2
-	push    arg3
-	push    arg4
-	push    arg5
-	lea     arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default
-
-	mov     eax, 1
-	cpuid
-	lea     arg3, [gf_vect_dot_prod_sse WRT_OPT]
-	test    ecx, FLAG_CPUID1_ECX_SSE4_1
-	cmovne  arg1, arg3
-
-	and		ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	cmp		ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	lea     arg3, [gf_vect_dot_prod_avx WRT_OPT]
-
-	jne     _done_gf_vect_dot_prod_init
-	mov		arg1, arg3
-
-	;; Try for AVX2
-	xor		ecx, ecx
-	mov		eax, 7
-	cpuid
-	test	ebx, FLAG_CPUID1_EBX_AVX2
-	lea     arg3, [gf_vect_dot_prod_avx2 WRT_OPT]
-	cmovne	arg1, arg3
-	;; Does it have xmm and ymm support
-	xor	ecx, ecx
-	xgetbv
-	and	eax, FLAG_XGETBV_EAX_XMM_YMM
-	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
-	je	_done_gf_vect_dot_prod_init
-	lea     arg1, [gf_vect_dot_prod_sse WRT_OPT]
-
-_done_gf_vect_dot_prod_init:
-	pop     arg5
-	pop     arg4
-	pop     arg3
-	pop     arg2
-	mov     [gf_vect_dot_prod_dispatched], arg1
-	pop	arg1
-	ret
-
-;;;;
-; gf_vect_mad multibinary function
-;;;;
-global gf_vect_mad:function
-gf_vect_mad_mbinit:
-	call    gf_vect_mad_dispatch_init
-
-gf_vect_mad:
-	jmp     wrd_sz [gf_vect_mad_dispatched]
-
-gf_vect_mad_dispatch_init:
-	push    arg1
-%ifidn __OUTPUT_FORMAT__, elf32         ;; 32-bit check
-	lea     arg1, [gf_vect_mad_base]
-%else
-	push	rax
-	push	rbx
-	push	rcx
-	push	rdx
-	lea     arg1, [gf_vect_mad_base WRT_OPT] ; Default
-
-	mov     eax, 1
-	cpuid
-	lea     rbx, [gf_vect_mad_sse WRT_OPT]
-	test    ecx, FLAG_CPUID1_ECX_SSE4_1
-	cmovne  arg1, rbx
-
-	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
-	lea     rbx, [gf_vect_mad_avx WRT_OPT]
-
-	jne     _done_gf_vect_mad_init
-	mov	rsi, rbx
-
-	;; Try for AVX2
-	xor	ecx, ecx
-	mov	eax, 7
-	cpuid
-	test	ebx, FLAG_CPUID1_EBX_AVX2
-	lea     rbx, [gf_vect_mad_avx2 WRT_OPT]
-	cmovne	rsi, rbx
-
-	;; Does it have xmm and ymm support
-	xor	ecx, ecx
-	xgetbv
-	and	eax, FLAG_XGETBV_EAX_XMM_YMM
-	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
-	je	_done_gf_vect_mad_init
-	lea     rsi, [gf_vect_mad_sse WRT_OPT]
-
-_done_gf_vect_mad_init:
-	pop     rdx
-	pop     rcx
-	pop     rbx
-	pop     rax
-%endif			;; END 32-bit check
-	mov     [gf_vect_mad_dispatched], arg1
-	pop	arg1
-	ret
+  mbin_dispatch_init5 ec_encode_data, ec_encode_data_base, ec_encode_data_sse, ec_encode_data_avx, ec_encode_data_avx2
+  mbin_dispatch_init5 ec_encode_data_update, ec_encode_data_update_base, ec_encode_data_update_sse, ec_encode_data_update_avx, ec_encode_data_update_avx2
+  mbin_dispatch_init5 gf_vect_mad, gf_vect_mad_base, gf_vect_mad_sse, gf_vect_mad_avx, gf_vect_mad_avx2
+  mbin_dispatch_init5 gf_vect_dot_prod, gf_vect_dot_prod_base, gf_vect_dot_prod_sse, gf_vect_dot_prod_avx, gf_vect_dot_prod_avx2
+%endif
+%endif

 ;;;       func                 		core, ver, snum
-slversion ec_encode_data,		00,   04,  0133
-slversion gf_vect_mul,			00,   03,  0134
-slversion ec_encode_data_update,	00,   03,  0212
-slversion gf_vect_dot_prod,		00,   03,  0138
-slversion gf_vect_mad,			00,   02,  0213
+slversion ec_encode_data,		00,   06,  0133
+slversion gf_vect_mul,			00,   05,  0134
+slversion ec_encode_data_update,	00,   05,  0212
+slversion gf_vect_dot_prod,		00,   05,  0138
+slversion gf_vect_mad,			00,   04,  0213
--- a/erasure_code/gf_2vect_dot_prod_avx512.asm
+++ b/erasure_code/gf_2vect_dot_prod_avx512.asm
@@ -0,0 +1,245 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+;;; Scratch aliases; r12 (tmp3) is the only register FUNC_SAVE/FUNC_RESTORE spill in this branch
+ %define tmp   r11
+ %define tmp2  r10
+ %define tmp3  r12		; must be saved and restored
+ %define return rax
+ %define PS     8		; step in bytes between entries of the pointer arrays
+ %define LOG_PS 3		; shift count used to multiply by PS
+;;; func(x) emits a plain label here; the prologue and epilogue are a single push/pop of r12
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+;;; arg0-arg3 are register arguments; arg4 is fetched from the stack at the end of FUNC_SAVE
+ %define arg4   r12 		; must be saved, loaded and restored
+ %define arg5   r15 		; must be saved and restored
+ %define tmp    r11
+ %define tmp2   r10
+ %define tmp3   r13		; must be saved and restored
+ %define return rax
+ %define PS     8
+ %define LOG_PS 3
+ %define stack_size  9*16 + 5*8		; must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	vmovdqa	[rsp + 0*16], xmm6	; xmm6-xmm14 occupy the nine 16-byte slots at the frame bottom
+	vmovdqa	[rsp + 1*16], xmm7
+	vmovdqa	[rsp + 2*16], xmm8
+	vmovdqa	[rsp + 3*16], xmm9
+	vmovdqa	[rsp + 4*16], xmm10
+	vmovdqa	[rsp + 5*16], xmm11
+	vmovdqa	[rsp + 6*16], xmm12
+	vmovdqa	[rsp + 7*16], xmm13
+	vmovdqa	[rsp + 8*16], xmm14
+	save_reg	r12,  9*16 + 0*8	; r12-r15 go in the qword slots above the xmm area
+	save_reg	r13,  9*16 + 1*8
+	save_reg	r14,  9*16 + 2*8
+	save_reg	r15,  9*16 + 3*8
+	end_prolog
+	mov	arg4, arg(4)		; load the fifth argument from above the return address
+ %endmacro
+;;; FUNC_RESTORE reloads every slot written by FUNC_SAVE, then releases the frame
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp + 0*16]
+	vmovdqa	xmm7, [rsp + 1*16]
+	vmovdqa	xmm8, [rsp + 2*16]
+	vmovdqa	xmm9, [rsp + 3*16]
+	vmovdqa	xmm10, [rsp + 4*16]
+	vmovdqa	xmm11, [rsp + 5*16]
+	vmovdqa	xmm12, [rsp + 6*16]
+	vmovdqa	xmm13, [rsp + 7*16]
+	vmovdqa	xmm14, [rsp + 8*16]
+	mov	r12,  [rsp + 9*16 + 0*8]
+	mov	r13,  [rsp + 9*16 + 1*8]
+	mov	r14,  [rsp + 9*16 + 2*8]
+	mov	r15,  [rsp + 9*16 + 3*8]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+
+%define len    arg0
+%define vec    arg1
+%define mul_array arg2
+%define src    arg3
+%define dest1  arg4
+%define ptr    arg5
+%define vec_i  tmp2
+%define dest2  tmp3
+%define pos    return
+
+
+;;; XLDR/XSTR select the 64-byte load/store instruction used on the source
+;;; and destination buffers, which are accessed through zmm registers.
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Aligned buffers: use Non-temporal load/store unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  ;; Plain vmovdqa has no zmm form; the EVEX aligned move is vmovdqa32/64.
+  %define XLDR vmovdqa64
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xmask0f   zmm8
+%define xgft1_lo  zmm7
+%define xgft1_loy ymm7
+%define xgft1_hi  zmm6
+%define xgft2_lo  zmm5
+%define xgft2_loy ymm5
+%define xgft2_hi  zmm4
+
+%define x0        zmm0
+%define xtmpa     zmm1
+%define xp1       zmm2
+%define xp2       zmm3
+
+default rel
+[bits 64]
+
+section .text
+;;; gf_2vect_dot_prod_avx512: per 64-byte column, XOR-accumulates nibble-table lookups of every source buffer into dest1 and dest2; returns 0, or 1 when len < 64
+align 16
+global gf_2vect_dot_prod_avx512:function
+func(gf_2vect_dot_prod_avx512)
+	FUNC_SAVE
+	sub	len, 64			;len now holds length-64, the last usable offset
+	jl	.return_fail		;fewer than 64 bytes: return 1 without touching the buffers
+;;; One-time setup: zero the offset, build the nibble mask, scale vec, load both dest pointers
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	mov	dest2, [dest1+PS]	;second entry of the dests pointer array
+	mov	dest1, [dest1]		;first entry; overwrites the array pointer itself
+;;; Outer loop: one 64-byte column per pass, accumulators cleared and table pointer rewound
+.loop64:
+	vpxorq	xp1, xp1, xp1
+	vpxorq	xp2, xp2, xp2
+	mov	tmp, mul_array
+	xor	vec_i, vec_i
+;;; Inner loop: fold one source buffer into both accumulators
+.next_vect:
+	mov	ptr, [src+vec_i]
+	XLDR	x0, [ptr+pos]		;Get next source vector
+	add	vec_i, PS
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 3-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 3-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 3-0
+;;; tmp walks the first table set 32 bytes per source; the second set starts 32 bytes times the source count later
+	vmovdqu8 xgft1_loy, [tmp]		;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
+	add	tmp, 32
+;;; Replicate 128-bit lane 1 (imm 0x55) into *_hi and lane 0 (imm 0x00) into *_lo across the whole zmm
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00
+	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+
+	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
+	vpxorq	xp1, xp1, xgft1_hi		;xp1 += partial
+
+	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
+	vpxorq	xp2, xp2, xgft2_hi		;xp2 += partial
+
+	cmp	vec_i, vec		;vec holds the source count scaled by PS
+	jl	.next_vect
+;;; Every source folded in: write this column to both destinations
+	XSTR	[dest1+pos], xp1
+	XSTR	[dest2+pos], xp2
+
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+;;; Here pos > length-64: finished only if pos landed exactly on the original length
+	lea	tmp, [len + 64]
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_2vect_dot_prod_avx512
+no_gf_2vect_dot_prod_avx512:
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/erasure_code/gf_2vect_mad_avx512.asm
+++ b/erasure_code/gf_2vect_mad_avx512.asm
@@ -0,0 +1,230 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0   rdi		; elf64: all six arguments arrive in registers
+ %define arg1   rsi
+ %define arg2   rdx
+ %define arg3   rcx
+ %define arg4   r8
+ %define arg5   r9
+ %define tmp    r11		; scratch
+ %define tmp2   r10		; scratch, aliased to dest2 below
+ %define return rax		; status code; also aliased to pos below
+ %define func(x) x:		; entry point is a plain label
+ %define FUNC_SAVE		; empty: no callee-saved register is used
+ %define FUNC_RESTORE		; empty: nothing to restore
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+ %define arg4   r12		; must be saved, loaded and restored
+ %define arg5   r15		; must be saved, loaded and restored
+ %define tmp    r11
+ %define tmp2   r10
+ %define return rax
+ %define stack_size  16*9 + 3*8 	; must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]	; argument slot x, valid once stack_size is allocated
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size	; allocate and record it in the unwind data (was a bare sub rsp)
+	vmovdqa	[rsp+16*0],xmm6		; xmm6-xmm14 are overwritten by the kernel
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	vmovdqa	[rsp+16*3],xmm9
+	vmovdqa	[rsp+16*4],xmm10
+	vmovdqa	[rsp+16*5],xmm11
+	vmovdqa	[rsp+16*6],xmm12
+	vmovdqa	[rsp+16*7],xmm13
+	vmovdqa	[rsp+16*8],xmm14
+	save_reg	r12,  9*16 + 0*8
+	save_reg	r15,  9*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)		; 5th argument (src) is read from the stack
+	mov	arg5, arg(5)		; 6th argument (dest) is read from the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	vmovdqa	xmm9, [rsp+16*3]
+	vmovdqa	xmm10, [rsp+16*4]
+	vmovdqa	xmm11, [rsp+16*5]
+	vmovdqa	xmm12, [rsp+16*6]
+	vmovdqa	xmm13, [rsp+16*7]
+	vmovdqa	xmm14, [rsp+16*8]
+	mov	r12,  [rsp + 9*16 + 0*8]
+	mov	r15,  [rsp + 9*16 + 1*8]
+	add	rsp, stack_size		; release the frame allocated by FUNC_SAVE
+ %endmacro
+%endif
+
+
+%define PS    8			; pointer size in bytes
+%define len   arg0		; byte count; reduced by 64 at function entry
+%define len.w arg0.w		; not referenced by the code in this file
+%define vec   arg1		; scaled by 32 at entry, then used as offset from table A to table B
+%define vec_i arg2		; scaled by 32 at entry, then used as offset into mul_array
+%define mul_array arg3		; base address of the multiply tables
+%define	src   arg4		; source buffer
+%define dest1 arg5		; on entry: array of dest pointers; afterwards: its first entry
+%define pos   return		; running byte offset; shares rax with the return value
+%define pos.w return.w		; not referenced by the code in this file
+%define dest2 tmp2		; second entry of the dest pointer array
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store; non-temporal unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa64	; plain vmovdqa has no zmm (EVEX) form
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0        zmm0		; source block, then its high nibbles
+%define xtmpa     zmm1		; low nibbles of the source block
+%define xtmph1    zmm2		; dest1 product from the high-nibble table
+%define xtmpl1    zmm3		; dest1 product from the low-nibble table
+%define xtmph2    zmm4		; dest2 product from the high-nibble table
+%define xtmpl2    zmm5		; dest2 product from the low-nibble table
+%define xd1       zmm6		; current dest1 block
+%define xd2       zmm7		; current dest2 block
+%define xtmpd1    zmm8		; not referenced by the code in this file
+%define xtmpd2    zmm9		; not referenced by the code in this file
+%define xgft1_hi  zmm10		; table A, high-nibble half in every 128-bit lane
+%define xgft1_lo  zmm11		; table A, low-nibble half in every 128-bit lane
+%define xgft1_loy ymm11		; ymm view of xgft1_lo for the 32-byte table load
+%define xgft2_hi  zmm12		; table B, high-nibble half in every 128-bit lane
+%define xgft2_lo  zmm13		; table B, low-nibble half in every 128-bit lane
+%define xgft2_loy ymm13		; ymm view of xgft2_lo for the 32-byte table load
+%define xmask0f   zmm14		; 0x0f in every byte
+
+align 16
+global gf_2vect_mad_avx512:function
+func(gf_2vect_mad_avx512)
+	FUNC_SAVE
+	sub	len, 64			;len = offset of the last full 64-byte block
+	jl	.return_fail		;Fewer than 64 bytes: return 1
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5			;Multiply by 32: offset from table A to table B
+	lea	tmp, [mul_array + vec_i]
+	vmovdqu	xgft1_loy, [tmp]	;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	vmovdqu	xgft2_loy, [tmp+vec]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55	;Copy 128-bit lane 1 into all four lanes
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00	;Copy 128-bit lane 0 into all four lanes
+	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+	mov	dest2, [dest1+PS]	;dest2 = second dest pointer (kept in tmp2)
+	mov	dest1, [dest1]		;dest1 = first dest pointer
+	mov	tmp, -1
+	kmovq	k1, tmp			;k1 = all ones: every byte lane active for full blocks
+
+.loop64:
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vpshufb	xtmph1 {k1}{z}, xgft1_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl1 {k1}{z}, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxorq	xd1, xd1, xtmph1		;xd1 += partial
+
+	vpshufb	xtmph2 {k1}{z}, xgft2_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2 {k1}{z}, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpxorq	xd2, xd2, xtmph2		;xd2 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+
+	lea	tmp, [len + 64]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass		;Everything processed, no tail left
+
+	;; Tail len
+	mov	pos, (1 << 63)
+	lea	tmp, [len + 64 - 1]	;tmp = original length - 1
+	and	tmp, 63
+	sarx	pos, pos, tmp		;Set the top (tmp + 1) bits
+	kmovq	k1, pos			;Zero the products for bytes already updated
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0		;Success
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1		;Failure: length below 64
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_2vect_mad_avx512
+no_gf_2vect_mad_avx512:		; only symbol emitted when HAVE_AS_KNOWS_AVX512 is undefined (win64 only)
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/erasure_code/gf_3vect_dot_prod_avx512.asm
+++ b/erasure_code/gf_3vect_dot_prod_avx512.asm
@@ -0,0 +1,270 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9		; the function takes five arguments; r9 is used as ptr
+
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2  r10
+ %define tmp3  r13		; must be saved and restored
+ %define tmp4  r12		; must be saved and restored
+ %define return rax
+ %define PS     8		; pointer size in bytes
+ %define LOG_PS 3		; log2(PS)
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r13		; reverse order of FUNC_SAVE
+	pop	r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+
+ %define arg4   r12 		; must be saved, loaded and restored
+ %define arg5   r15 		; must be saved and restored
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define tmp2   r10
+ %define tmp3   r13		; must be saved and restored
+ %define tmp4   r14		; must be saved and restored
+ %define return rax
+ %define PS     8		; pointer size in bytes
+ %define LOG_PS 3		; log2(PS)
+ %define stack_size  9*16 + 5*8		; must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]	; argument slot x, valid once stack_size is allocated
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	vmovdqa	[rsp + 0*16], xmm6
+	vmovdqa	[rsp + 1*16], xmm7
+	vmovdqa	[rsp + 2*16], xmm8
+	vmovdqa	[rsp + 3*16], xmm9
+	vmovdqa	[rsp + 4*16], xmm10
+	vmovdqa	[rsp + 5*16], xmm11
+	vmovdqa	[rsp + 6*16], xmm12
+	vmovdqa	[rsp + 7*16], xmm13
+	vmovdqa	[rsp + 8*16], xmm14
+	save_reg	r12,  9*16 + 0*8
+	save_reg	r13,  9*16 + 1*8
+	save_reg	r14,  9*16 + 2*8
+	save_reg	r15,  9*16 + 3*8
+	end_prolog
+	mov	arg4, arg(4)		; 5th argument (dests) is read from the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp + 0*16]
+	vmovdqa	xmm7, [rsp + 1*16]
+	vmovdqa	xmm8, [rsp + 2*16]
+	vmovdqa	xmm9, [rsp + 3*16]
+	vmovdqa	xmm10, [rsp + 4*16]
+	vmovdqa	xmm11, [rsp + 5*16]
+	vmovdqa	xmm12, [rsp + 6*16]
+	vmovdqa	xmm13, [rsp + 7*16]
+	vmovdqa	xmm14, [rsp + 8*16]
+	mov	r12,  [rsp + 9*16 + 0*8]
+	mov	r13,  [rsp + 9*16 + 1*8]
+	mov	r14,  [rsp + 9*16 + 2*8]
+	mov	r15,  [rsp + 9*16 + 3*8]
+	add	rsp, stack_size		; release the frame allocated by FUNC_SAVE
+ %endmacro
+%endif
+
+
+%define len    arg0		; byte count; reduced by 64 at function entry
+%define vec    arg1		; number of sources; multiplied by PS at entry
+%define mul_array arg2		; base address of the multiply tables
+%define src    arg3		; array of source pointers
+%define dest1  arg4		; on entry: array of dest pointers; afterwards: its first entry
+%define ptr    arg5		; current source pointer
+%define vec_i  tmp2		; byte offset into the src pointer array
+%define dest2  tmp3		; second entry of the dest pointer array
+%define dest3  tmp4		; third entry of the dest pointer array
+%define pos    return		; running byte offset; shares rax with the return value
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store; non-temporal unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa64	; plain vmovdqa has no zmm (EVEX) form
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xmask0f   zmm11		; 0x0f in every byte
+%define xgft1_lo  zmm10		; table A, low-nibble half; reloaded for every source
+%define xgft1_loy ymm10		; ymm view of xgft1_lo for the 32-byte table load
+%define xgft1_hi  zmm9		; table A, high-nibble half
+%define xgft2_lo  zmm8		; table B, low-nibble half
+%define xgft2_loy ymm8		; ymm view of xgft2_lo for the 32-byte table load
+%define xgft2_hi  zmm7		; table B, high-nibble half
+%define xgft3_lo  zmm6		; table C, low-nibble half
+%define xgft3_loy ymm6		; ymm view of xgft3_lo for the 32-byte table load
+%define xgft3_hi  zmm5		; table C, high-nibble half
+
+%define x0        zmm0		; source block, then its high nibbles
+%define xtmpa     zmm1		; low nibbles of the source block
+%define xp1       zmm2		; accumulator stored to dest1
+%define xp2       zmm3		; accumulator stored to dest2
+%define xp3       zmm4		; accumulator stored to dest3
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global gf_3vect_dot_prod_avx512:function
+func(gf_3vect_dot_prod_avx512)
+	FUNC_SAVE
+	sub	len, 64			;len = offset of the last full 64-byte block
+	jl	.return_fail		;Fewer than 64 bytes: return 1
+
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	mov	dest2, [dest1+PS]	;dest2 = second dest pointer
+	mov	dest3, [dest1+2*PS]	;dest3 = third dest pointer
+	mov	dest1, [dest1]		;dest1 = first dest pointer
+
+.loop64:
+	vpxorq	xp1, xp1, xp1		;Clear the three accumulators
+	vpxorq	xp2, xp2, xp2
+	vpxorq	xp3, xp3, xp3
+	mov	tmp, mul_array		;tmp walks the tables, 32 bytes per source
+	xor	vec_i, vec_i
+
+.next_vect:
+	mov	ptr, [src+vec_i]
+	XLDR	x0, [ptr+pos]		;Get next source vector
+	add	vec_i, PS
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vmovdqu8 xgft1_loy, [tmp]		;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
+	vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
+	add	tmp, 32
+
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55	;Copy 128-bit lane 1 into all four lanes
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00	;Copy 128-bit lane 0 into all four lanes
+	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+
+	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
+	vpxorq	xp1, xp1, xgft1_hi		;xp1 += partial
+
+	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
+	vpxorq	xp2, xp2, xgft2_hi		;xp2 += partial
+
+	vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+	vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+
+	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft3_hi, xgft3_hi, xgft3_lo	;GF add high and low partials
+	vpxorq	xp3, xp3, xgft3_hi		;xp3 += partial
+
+	cmp	vec_i, vec
+	jl	.next_vect		;More sources to fold into this block
+
+	XSTR	[dest1+pos], xp1
+	XSTR	[dest2+pos], xp2
+	XSTR	[dest3+pos], xp3
+
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+
+	lea	tmp, [len + 64]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass		;Everything processed, no tail left
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0		;Success
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1		;Failure: length below 64
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_3vect_dot_prod_avx512
+no_gf_3vect_dot_prod_avx512:	; only symbol emitted when HAVE_AS_KNOWS_AVX512 is undefined (win64 only)
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/erasure_code/gf_3vect_mad_avx512.asm
+++ b/erasure_code/gf_3vect_mad_avx512.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0   rdi		; elf64: all six arguments arrive in registers
+ %define arg1   rsi
+ %define arg2   rdx
+ %define arg3   rcx
+ %define arg4   r8
+ %define arg5   r9
+ %define tmp    r11		; scratch
+ %define return rax		; status code; also aliased to pos below
+ %define func(x) x:		; entry point is a plain label
+ %define FUNC_SAVE		; empty: no callee-saved register is used
+ %define FUNC_RESTORE		; empty: nothing to restore
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+ %define arg4   r12 		; must be saved, loaded and restored
+ %define arg5   r15 		; must be saved, loaded and restored
+ %define tmp    r11
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8	; must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]	; argument slot x, valid once stack_size is allocated
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size	; allocate and record it in the unwind data (was a bare sub rsp)
+	vmovdqa	[rsp+16*0],xmm6		; xmm6-xmm15 are overwritten by the kernel
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	vmovdqa	[rsp+16*3],xmm9
+	vmovdqa	[rsp+16*4],xmm10
+	vmovdqa	[rsp+16*5],xmm11
+	vmovdqa	[rsp+16*6],xmm12
+	vmovdqa	[rsp+16*7],xmm13
+	vmovdqa	[rsp+16*8],xmm14
+	vmovdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8
+	save_reg	r15,  10*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)		; 5th argument (src) is read from the stack
+	mov	arg5, arg(5)		; 6th argument (dest) is read from the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	vmovdqa	xmm9, [rsp+16*3]
+	vmovdqa	xmm10, [rsp+16*4]
+	vmovdqa	xmm11, [rsp+16*5]
+	vmovdqa	xmm12, [rsp+16*6]
+	vmovdqa	xmm13, [rsp+16*7]
+	vmovdqa	xmm14, [rsp+16*8]
+	vmovdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r15,  [rsp + 10*16 + 1*8]
+	add	rsp, stack_size		; release the frame allocated by FUNC_SAVE
+ %endmacro
+%endif
+
+%define PS    8			; pointer size in bytes
+%define len   arg0		; byte count; reduced by 64 at function entry
+%define vec   arg1		; scaled by 32 at entry, then used as offset between tables A, B and C
+%define vec_i arg2		; scaled by 32 at entry, then used as offset into mul_array
+%define mul_array arg3		; base address of the multiply tables
+%define	src   arg4		; source buffer
+%define dest1 arg5		; on entry: array of dest pointers; afterwards: its first entry
+%define pos   return		; running byte offset; shares rax with the return value
+%define dest2 mul_array		; register reused once the tables are loaded
+%define dest3 vec_i		; register reused once the tables are loaded
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store; non-temporal unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa64	; plain vmovdqa has no zmm (EVEX) form
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0        zmm0		; source block, then its high nibbles
+%define xtmpa     zmm1		; low nibbles of the source block
+%define xtmph1    zmm2		; dest1 product from the high-nibble table
+%define xtmpl1    zmm3		; dest1 product from the low-nibble table
+%define xtmph2    zmm4		; dest2 product from the high-nibble table
+%define xtmpl2    zmm5		; dest2 product from the low-nibble table
+%define xtmph3    zmm6		; dest3 product from the high-nibble table
+%define xtmpl3    zmm7		; dest3 product from the low-nibble table
+%define xgft1_hi  zmm8		; table A, high-nibble half in every 128-bit lane
+%define xgft1_lo  zmm9		; table A, low-nibble half in every 128-bit lane
+%define xgft1_loy ymm9		; ymm view of xgft1_lo for the 32-byte table load
+%define xgft2_hi  zmm10		; table B, high-nibble half in every 128-bit lane
+%define xgft2_lo  zmm11		; table B, low-nibble half in every 128-bit lane
+%define xgft2_loy ymm11		; ymm view of xgft2_lo for the 32-byte table load
+%define xgft3_hi  zmm12		; table C, high-nibble half in every 128-bit lane
+%define xgft3_lo  zmm13		; table C, low-nibble half in every 128-bit lane
+%define xgft3_loy ymm13		; ymm view of xgft3_lo for the 32-byte table load
+%define xd1       zmm14		; current dest1 block
+%define xd2       zmm15		; current dest2 block
+%define xd3       zmm16		; current dest3 block
+%define xmask0f   zmm17		; 0x0f in every byte
+
+align 16
+global gf_3vect_mad_avx512:function
+func(gf_3vect_mad_avx512)
+	FUNC_SAVE
+	sub	len, 64			;len = offset of the last full 64-byte block
+	jl	.return_fail		;Fewer than 64 bytes: return 1
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	sal	vec_i, 5		;Multiply by 32
+	sal	vec, 5			;Multiply by 32: offset from one table to the next
+	lea	tmp, [mul_array + vec_i]
+	vmovdqu	xgft1_loy, [tmp]	;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	vmovdqu	xgft2_loy, [tmp+vec]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
+	vmovdqu	xgft3_loy, [tmp+2*vec]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55	;Copy 128-bit lane 1 into all four lanes
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00	;Copy 128-bit lane 0 into all four lanes
+	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+	vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+	vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+	mov	dest2, [dest1+PS]	; reuse mul_array
+	mov	dest3, [dest1+2*PS]	; reuse vec_i
+	mov	dest1, [dest1]		;dest1 = first dest pointer
+	mov	tmp, -1
+	kmovq	k1, tmp			;k1 = all ones: every byte lane active for full blocks
+
+.loop64:
+	XLDR	x0, [src+pos]		;Get next source vector
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+	XLDR	xd3, [dest3+pos]	;Get next dest vector
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1
+	vpshufb	xtmph1 {k1}{z}, xgft1_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl1 {k1}{z}, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxorq	xd1, xd1, xtmph1		;xd1 += partial
+
+	; dest2
+	vpshufb	xtmph2 {k1}{z}, xgft2_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2 {k1}{z}, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpxorq	xd2, xd2, xtmph2		;xd2 += partial
+
+	; dest3
+	vpshufb	xtmph3 {k1}{z}, xgft3_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl3 {k1}{z}, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph3, xtmph3, xtmpl3		;GF add high and low partials
+	vpxorq	xd3, xd3, xtmph3		;xd3 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+	XSTR	[dest3+pos], xd3
+
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+
+	lea	tmp, [len + 64]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass		;Everything processed, no tail left
+
+	;; Tail len
+	mov	pos, (1 << 63)
+	lea	tmp, [len + 64 - 1]	;tmp = original length - 1
+	and	tmp, 63
+	sarx	pos, pos, tmp		;Set the top (tmp + 1) bits
+	kmovq	k1, pos			;Zero the products for bytes already updated
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0		;Success
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1		;Failure: length below 64
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_3vect_mad_avx512
+no_gf_3vect_mad_avx512:		; only symbol emitted when HAVE_AS_KNOWS_AVX512 is undefined (win64 only)
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/erasure_code/gf_4vect_dot_prod_avx512.asm
+++ b/erasure_code/gf_4vect_dot_prod_avx512.asm
@@ -0,0 +1,301 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9		; the function takes five arguments; r9 is used as ptr
+
+ %define tmp   r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2  r10
+ %define tmp3  r13		; must be saved and restored
+ %define tmp4  r12		; must be saved and restored
+ %define tmp5  r14		; must be saved and restored
+ %define tmp6  r15		; must be saved and restored
+ %define return rax
+ %define PS     8		; pointer size in bytes
+ %define LOG_PS 3		; log2(PS)
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+	pop	r15		; reverse order of FUNC_SAVE
+	pop	r14
+	pop	r13
+	pop	r12
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+
+ %define arg4   r12 		; must be saved, loaded and restored
+ %define arg5   r15 		; must be saved and restored
+ %define tmp    r11
+ %define tmp.w  r11d
+ %define tmp.b  r11b
+ %define tmp2   r10
+ %define tmp3   r13		; must be saved and restored
+ %define tmp4   r14		; must be saved and restored
+ %define tmp5   rdi		; must be saved and restored
+ %define tmp6   rsi		; must be saved and restored
+ %define return rax
+ %define PS     8		; pointer size in bytes
+ %define LOG_PS 3		; log2(PS)
+ %define stack_size  9*16 + 7*8		; must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]	; argument slot x, valid once stack_size is allocated
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	vmovdqa	[rsp + 0*16], xmm6
+	vmovdqa	[rsp + 1*16], xmm7
+	vmovdqa	[rsp + 2*16], xmm8
+	vmovdqa	[rsp + 3*16], xmm9
+	vmovdqa	[rsp + 4*16], xmm10
+	vmovdqa	[rsp + 5*16], xmm11
+	vmovdqa	[rsp + 6*16], xmm12
+	vmovdqa	[rsp + 7*16], xmm13
+	vmovdqa	[rsp + 8*16], xmm14
+	save_reg	r12,  9*16 + 0*8
+	save_reg	r13,  9*16 + 1*8
+	save_reg	r14,  9*16 + 2*8
+	save_reg	r15,  9*16 + 3*8
+	save_reg	rdi,  9*16 + 4*8
+	save_reg	rsi,  9*16 + 5*8
+	end_prolog
+	mov	arg4, arg(4)		; 5th argument (dests) is read from the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp + 0*16]
+	vmovdqa	xmm7, [rsp + 1*16]
+	vmovdqa	xmm8, [rsp + 2*16]
+	vmovdqa	xmm9, [rsp + 3*16]
+	vmovdqa	xmm10, [rsp + 4*16]
+	vmovdqa	xmm11, [rsp + 5*16]
+	vmovdqa	xmm12, [rsp + 6*16]
+	vmovdqa	xmm13, [rsp + 7*16]
+	vmovdqa	xmm14, [rsp + 8*16]
+	mov	r12,  [rsp + 9*16 + 0*8]
+	mov	r13,  [rsp + 9*16 + 1*8]
+	mov	r14,  [rsp + 9*16 + 2*8]
+	mov	r15,  [rsp + 9*16 + 3*8]
+	mov	rdi,  [rsp + 9*16 + 4*8]
+	mov	rsi,  [rsp + 9*16 + 5*8]
+	add	rsp, stack_size		; release the frame allocated by FUNC_SAVE
+ %endmacro
+%endif
+
+
+%define len    arg0		; byte count; reduced by 64 at function entry
+%define vec    arg1		; number of sources; multiplied by PS at entry
+%define mul_array arg2		; base address of the multiply tables
+%define src    arg3		; array of source pointers
+%define dest1  arg4		; on entry: array of dest pointers; afterwards: its first entry
+%define ptr    arg5		; current source pointer
+%define vec_i  tmp2		; byte offset into the src pointer array
+%define dest2  tmp3		; second entry of the dest pointer array
+%define dest3  tmp4		; third entry of the dest pointer array
+%define dest4  tmp5		; fourth entry of the dest pointer array
+%define vskip3 tmp6		; original vec * 96: offset from table A to table D
+%define pos    return		; running byte offset; shares rax with the return value
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Use aligned load/store; non-temporal unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa64	; plain vmovdqa has no zmm (EVEX) form
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+%define xmask0f   zmm14		; 0x0f in every byte
+%define xgft1_lo  zmm13		; table A, low-nibble half; reloaded for every source
+%define xgft1_loy ymm13		; ymm view of xgft1_lo for the 32-byte table load
+%define xgft1_hi  zmm12		; table A, high-nibble half
+%define xgft2_lo  zmm11		; table B, low-nibble half
+%define xgft2_loy ymm11		; ymm view of xgft2_lo for the 32-byte table load
+%define xgft2_hi  zmm10		; table B, high-nibble half
+%define xgft3_lo  zmm9		; table C, low-nibble half
+%define xgft3_loy ymm9		; ymm view of xgft3_lo for the 32-byte table load
+%define xgft3_hi  zmm8		; table C, high-nibble half
+%define xgft4_lo  zmm7		; table D, low-nibble half
+%define xgft4_loy ymm7		; ymm view of xgft4_lo for the 32-byte table load
+%define xgft4_hi  zmm6		; table D, high-nibble half
+
+%define x0        zmm0		; source block, then its high nibbles
+%define xtmpa     zmm1		; low nibbles of the source block
+%define xp1       zmm2		; accumulator stored to dest1
+%define xp2       zmm3		; accumulator stored to dest2
+%define xp3       zmm4		; accumulator stored to dest3
+%define xp4       zmm5		; accumulator stored to dest4
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global gf_4vect_dot_prod_avx512:function
+func(gf_4vect_dot_prod_avx512)
+	FUNC_SAVE
+	sub	len, 64			;len = offset of the last full 64-byte block
+	jl	.return_fail		;Fewer than 64 bytes: return 1
+
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	mov	vskip3, vec
+	imul	vskip3, 96		;vskip3 = vec * 96, taken before vec is scaled
+	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	mov	dest2, [dest1+PS]	;dest2 = second dest pointer
+	mov	dest3, [dest1+2*PS]	;dest3 = third dest pointer
+	mov	dest4, [dest1+3*PS]	;dest4 = fourth dest pointer
+	mov	dest1, [dest1]		;dest1 = first dest pointer
+
+.loop64:
+	vpxorq	xp1, xp1, xp1		;Clear the four accumulators
+	vpxorq	xp2, xp2, xp2
+	vpxorq	xp3, xp3, xp3
+	vpxorq	xp4, xp4, xp4
+	mov	tmp, mul_array		;tmp walks the tables, 32 bytes per source
+	xor	vec_i, vec_i
+
+.next_vect:
+	mov	ptr, [src+vec_i]
+	XLDR	x0, [ptr+pos]		;Get next source vector
+	add	vec_i, PS
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vmovdqu8 xgft1_loy, [tmp]		;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
+	vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
+	vmovdqu8 xgft4_loy, [tmp+vskip3]	;Load array Dx{00}..{0f}, Dx{00}..{f0}
+	add	tmp, 32
+
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55	;Copy 128-bit lane 1 into all four lanes
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00	;Copy 128-bit lane 0 into all four lanes
+	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+
+	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
+	vpxorq	xp1, xp1, xgft1_hi		;xp1 += partial
+
+	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
+	vpxorq	xp2, xp2, xgft2_hi		;xp2 += partial
+
+	vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+	vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+	vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
+	vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00
+
+	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft3_hi, xgft3_hi, xgft3_lo	;GF add high and low partials
+	vpxorq	xp3, xp3, xgft3_hi		;xp3 += partial
+
+	vpshufb	xgft4_hi, xgft4_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft4_lo, xgft4_lo, xtmpa 	;Lookup mul table of low nibble
+	vpxorq	xgft4_hi, xgft4_hi, xgft4_lo	;GF add high and low partials
+	vpxorq	xp4, xp4, xgft4_hi		;xp4 += partial
+
+	cmp	vec_i, vec
+	jl	.next_vect		;More sources to fold into this block
+
+	XSTR	[dest1+pos], xp1
+	XSTR	[dest2+pos], xp2
+	XSTR	[dest3+pos], xp3
+	XSTR	[dest4+pos], xp4
+
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+
+	lea	tmp, [len + 64]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass		;Everything processed, no tail left
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0		;Success
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1		;Failure: length below 64
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_4vect_dot_prod_avx512
+no_gf_4vect_dot_prod_avx512:	; only symbol emitted when HAVE_AS_KNOWS_AVX512 is undefined (win64 only)
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/erasure_code/gf_4vect_mad_avx512.asm
+++ b/erasure_code/gf_4vect_mad_avx512.asm
@@ -0,0 +1,267 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0   rdi	; System V AMD64 integer argument registers, in order
+ %define arg1   rsi
+ %define arg2   rdx
+ %define arg3   rcx
+ %define arg4   r8
+ %define arg5   r9
+ %define tmp    r11	; scratch register
+ %define return rax	; return value register
+ %define func(x) x:	; entry point is a plain label
+ %define FUNC_SAVE	; empty: no prologue is emitted for elf64
+ %define FUNC_RESTORE	; empty: no epilogue is emitted for elf64
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx	; Microsoft x64: the first four arguments arrive in rcx, rdx, r8, r9
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+ %define arg4   r12	; callee-saved; FUNC_SAVE saves it and then loads argument 4 from the stack
+ %define arg5   r15	; callee-saved; FUNC_SAVE saves it and then loads argument 5 from the stack
+ %define tmp    r11	; scratch register
+ %define return rax	; return value register
+ %define stack_size 16*10 + 3*8	; ten 16-byte XMM slots plus three qwords (r12, r15, one unused)
+ %define arg(x)      [rsp + stack_size + PS + PS*x]	; slot x above the local frame and the return address
+ %define func(x) proc_frame x
+;;; Prologue: allocate the frame, save xmm6-xmm15, r12 and r15, then fetch stack arguments 4 and 5
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	movdqa	[rsp+16*0],xmm6		; movdqa is an aligned store, so rsp must be 16-byte aligned here
+	movdqa	[rsp+16*1],xmm7
+	movdqa	[rsp+16*2],xmm8
+	movdqa	[rsp+16*3],xmm9
+	movdqa	[rsp+16*4],xmm10
+	movdqa	[rsp+16*5],xmm11
+	movdqa	[rsp+16*6],xmm12
+	movdqa	[rsp+16*7],xmm13
+	movdqa	[rsp+16*8],xmm14
+	movdqa	[rsp+16*9],xmm15
+	save_reg	r12,  10*16 + 0*8	; integer saves sit right after the ten XMM slots
+	save_reg	r15,  10*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)		; r12 = argument 4 (src)
+	mov	arg5, arg(5)		; r15 = argument 5 (dest)
+%endmacro
+;;; Epilogue: restore everything FUNC_SAVE saved, using the same offsets, and release the frame
+%macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp+16*0]
+	movdqa	xmm7, [rsp+16*1]
+	movdqa	xmm8, [rsp+16*2]
+	movdqa	xmm9, [rsp+16*3]
+	movdqa	xmm10, [rsp+16*4]
+	movdqa	xmm11, [rsp+16*5]
+	movdqa	xmm12, [rsp+16*6]
+	movdqa	xmm13, [rsp+16*7]
+	movdqa	xmm14, [rsp+16*8]
+	movdqa	xmm15, [rsp+16*9]
+	mov	r12,  [rsp + 10*16 + 0*8]
+	mov	r15,  [rsp + 10*16 + 1*8]
+	add	rsp, stack_size
+%endmacro
+%endif
+
+%define PS    8	; pointer size in bytes; stride of the dest pointer array and of stack argument slots
+%define len   arg0	; byte length; reduced by 64 at function entry
+%define vec   arg1	; scaled by 32 at entry and used as the byte stride between the four tables
+%define vec_i arg2	; scaled by 32 at entry and used as the byte offset of the first table
+%define mul_array arg3	; base address of the multiplication tables
+%define	src   arg4	; source buffer
+%define dest1 arg5	; on entry: address of four dest pointers; later replaced by the first of them
+%define pos   return	; running byte offset lives in rax until the return code is written
+%define dest2 mul_array	; register reused once the tables have been loaded
+%define dest3 vec	; register reused once the tables have been loaded
+%define dest4 vec_i	; register reused once the tables have been loaded
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Aligned addresses: use non-temporal load/store unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa64	; plain vmovdqa has no zmm form; the EVEX aligned move is vmovdqa32/64
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+[bits 64]
+section .text
+
+%define x0        zmm0	; source block; holds the high nibbles after shift and mask
+%define xtmpa     zmm1	; low nibbles of the source block
+%define xtmpl1    zmm2	; low-nibble lookup result for dest1
+%define xtmph1    zmm3	; high-nibble lookup result, then combined partial, for dest1
+%define xtmph2    zmm4	; same for dest2
+%define xtmph3    zmm5	; same for dest3
+%define xtmph4    zmm6	; same for dest4
+%define xgft1_hi  zmm7	; table 1, high-nibble half replicated to every 128-bit lane
+%define xgft1_lo  zmm8	; table 1, low-nibble half replicated to every 128-bit lane
+%define xgft1_loy ymm8	; 256-bit view used for the initial 32-byte table load
+%define xgft2_hi  zmm9
+%define xgft2_lo  zmm10
+%define xgft2_loy ymm10
+%define xgft3_hi  zmm11
+%define xgft3_lo  zmm12
+%define xgft3_loy ymm12
+%define xgft4_hi  zmm13
+%define xgft4_lo  zmm14
+%define xgft4_loy ymm14
+%define xd1       zmm15	; current 64 bytes of dest1
+%define xd2       zmm16	; current 64 bytes of dest2
+%define xd3       zmm17	; current 64 bytes of dest3
+%define xd4       zmm18	; current 64 bytes of dest4
+%define xmask0f   zmm19	; 0x0f in every byte
+%define xtmpl2    zmm20	; low-nibble lookup result for dest2
+%define xtmpl3    zmm21	; low-nibble lookup result for dest3
+%define xtmpl4    zmm22	; low-nibble lookup result for dest4
+%define xtmpl5    zmm23	; not referenced by gf_4vect_mad_avx512
+
+align 16	; gf_4vect_mad_avx512: dest_k ^= GF-multiply(src, table k) for k = 1..4; returns 0, or 1 if len < 64
+global gf_4vect_mad_avx512:function
+func(gf_4vect_mad_avx512)
+	FUNC_SAVE
+	sub	len, 64			;len = length - 64: last offset where a full 64-byte block fits
+	jl	.return_fail		;Fewer than 64 bytes: return 1
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	sal	vec_i, 5		;Multiply by 32 (bytes per table): offset of table 1
+	sal	vec, 5			;Multiply by 32: byte stride between tables 1..4
+	lea	tmp, [mul_array + vec_i]
+	vmovdqu	xgft1_loy, [tmp]	;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	vmovdqu	xgft2_loy, [tmp+vec]	;Load array Bx{00}..{0f}, Bx{00}..{f0}
+	vmovdqu	xgft3_loy, [tmp+2*vec]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
+	add	tmp, vec		;tmp+2*vec now addresses table 4 (3*vec past table 1)
+	vmovdqu	xgft4_loy, [tmp+2*vec]	;Load array Dx{00}..{0f}, Dx{00}..{f0}
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55	;Replicate lane 1 (high-nibble table) to all lanes
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00	;Replicate lane 0 (low-nibble table) to all lanes
+	vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
+	vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00
+	vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
+	vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
+	vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
+	vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00
+	mov	dest2, [dest1+PS]		; reuse mul_array
+	mov	dest3, [dest1+2*PS]		; reuse vec
+	mov	dest4, [dest1+3*PS]		; reuse vec_i
+	mov	dest1, [dest1]			; loaded last: dest1 was the pointer-array base until here
+	mov	tmp, -1
+	kmovq	k1, tmp				; k1 = all ones: every byte contributes on full passes
+
+.loop64:
+	XLDR	x0, [src+pos]		;Get next source vector
+	XLDR	xd1, [dest1+pos]	;Get next dest vector
+	XLDR	xd2, [dest2+pos]	;Get next dest vector
+	XLDR	xd3, [dest3+pos]	;Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Get next dest vector
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	; dest1 ({k1}{z} zeroes the partial for bytes whose mask bit is clear, leaving xd1 unchanged there)
+	vpshufb	xtmph1 {k1}{z}, xgft1_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl1 {k1}{z}, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph1, xtmph1, xtmpl1		;GF add high and low partials
+	vpxorq	xd1, xd1, xtmph1		;xd1 += partial
+
+	; dest2
+	vpshufb	xtmph2 {k1}{z}, xgft2_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl2 {k1}{z}, xgft2_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph2, xtmph2, xtmpl2		;GF add high and low partials
+	vpxorq	xd2, xd2, xtmph2		;xd2 += partial
+
+	; dest3
+	vpshufb	xtmph3 {k1}{z}, xgft3_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl3 {k1}{z}, xgft3_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph3, xtmph3, xtmpl3		;GF add high and low partials
+	vpxorq	xd3, xd3, xtmph3		;xd3 += partial
+
+	; dest4
+	vpshufb	xtmph4 {k1}{z}, xgft4_hi, x0	;Lookup mul table of high nibble
+	vpshufb	xtmpl4 {k1}{z}, xgft4_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xtmph4, xtmph4, xtmpl4		;GF add high and low partials
+	vpxorq	xd4, xd4, xtmph4		;xd4 += partial
+
+	XSTR	[dest1+pos], xd1
+	XSTR	[dest2+pos], xd2
+	XSTR	[dest3+pos], xd3
+	XSTR	[dest4+pos], xd4
+
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+
+	lea	tmp, [len + 64]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass		;pos reached the end exactly: no partial block left
+
+	;; Tail len: redo the last 64 bytes, masked so only bytes not yet accumulated are updated
+	mov	pos, (1 << 63)
+	lea	tmp, [len + 64 - 1]
+	and	tmp, 63			;tmp = (length - 1) mod 64
+	sarx	pos, pos, tmp		;Arithmetic shift sets the top tmp+1 bits = the remaining tail bytes
+	kmovq	k1, pos
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_4vect_mad_avx512	; placeholder symbol emitted only when the AVX512 body is compiled out
+no_gf_4vect_mad_avx512:	; label only, no code; presumably keeps the win64 object non-empty - TODO confirm
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/erasure_code/gf_vect_dot_prod_avx512.asm
+++ b/erasure_code/gf_vect_dot_prod_avx512.asm
@@ -0,0 +1,240 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, *dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi	; System V AMD64 integer argument registers, in order
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9	; not an incoming argument here (five-argument function); used as scratch
+
+ %define tmp   r11	; scratch register
+ %define tmp2  r10	; scratch register
+ %define return rax	; return value register
+ %define PS     8	; pointer size in bytes
+ %define LOG_PS 3	; log2(PS), used to scale vec by shifting
+
+ %define func(x) x:	; entry point is a plain label
+ %define FUNC_SAVE	; empty: no prologue is emitted for elf64
+ %define FUNC_RESTORE	; empty: no epilogue is emitted for elf64
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+
+ %define arg4   r12 		; must be saved, loaded and restored
+ %define arg5   r15 		; must be saved and restored
+ %define tmp    r11
+ %define tmp2   r10
+ %define return rax
+ %define PS     8
+ %define LOG_PS 3
+ %define stack_size  0*16 + 3*8		; must be an odd multiple of 8
+ %define arg(x)      [rsp + stack_size + PS + PS*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_reg	r12,  0*16 + 0*8	; save slots must lie inside the stack_size bytes just allocated;
+	save_reg	r15,  0*16 + 1*8	; the former 9*16 offsets wrote into the caller's stack
+	end_prolog
+	mov	arg4, arg(4)			; r12 = argument 4 (dest), passed on the stack
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	mov	r12,  [rsp + 0*16 + 0*8]	; offsets mirror FUNC_SAVE
+	mov	r15,  [rsp + 0*16 + 1*8]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+
+%define len    arg0	; byte length; reduced by 64 at function entry
+%define vec    arg1	; number of sources; scaled by PS at entry to bound vec_i
+%define mul_array arg2	; multiplication tables, 32 bytes consumed per source
+%define src    arg3	; array of source buffer pointers
+%define dest1  arg4	; destination buffer
+%define ptr    arg5	; scratch: the source pointer currently being read
+%define vec_i  tmp2	; byte offset into the src pointer array
+%define pos    return	; running byte offset lives in rax until the return code is written
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Aligned addresses: use non-temporal load/store unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa64	; plain vmovdqa has no zmm form; the EVEX aligned move is vmovdqa32/64
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+;;; Vector register roles; each *y name is the 256-bit view of the zmm register above it
+%define xmask0f   zmm5
+%define xgft1_lo  zmm4
+%define xgft1_loy ymm4
+%define xgft1_hi  zmm3
+%define x0        zmm0
+%define x0y       ymm0
+%define xtmpa     zmm1
+%define xp1       zmm2
+%define xp1y      ymm2
+
+default rel
+[bits 64]
+section .text
+
+align 16	; gf_vect_dot_prod_avx512: dest = XOR over all sources of GF-multiply(src_i, table_i); returns 0, or 1 if len < 32
+global gf_vect_dot_prod_avx512:function
+func(gf_vect_dot_prod_avx512)
+	FUNC_SAVE
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
+	sub	len, 64			;len = length - 64: last offset where a full 64-byte block fits
+	jl	.len_lt_64		;Shorter than 64 bytes: use the 32-byte path
+
+.loop64:
+	vpxorq	xp1, xp1, xp1		;Clear the accumulator for this 64-byte block
+	mov	tmp, mul_array		;Restart at the first source's table
+	xor	vec_i, vec_i
+
+.next_vect:
+	mov	ptr, [src+vec_i]	;ptr = next source buffer
+	XLDR	x0, [ptr+pos]		;Get next source vector
+	add	vec_i, PS
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vmovdqu8 xgft1_loy, [tmp]		;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	add	tmp, 32			;Advance to the next source's 32-byte table
+
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55	;Replicate lane 1 (high-nibble table) to all lanes
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00	;Replicate lane 0 (low-nibble table) to all lanes
+
+	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
+	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
+	vpxorq	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
+	vpxorq	xp1, xp1, xgft1_hi		;xp1 += partial
+
+	cmp	vec_i, vec
+	jl	.next_vect		;More sources to fold into this block
+
+	XSTR	[dest1+pos], xp1
+
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+
+	lea	tmp, [len + 64]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass		;pos reached the end exactly: no partial block left
+
+	;; Tail len: dest is recomputed from zero each pass, so an overlapping pass needs no mask
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+
+.len_lt_64:			; 32-byte version
+	add	len, 32			;len = length - 32
+	jl	.return_fail		;Fewer than 32 bytes: return 1
+
+.loop32:
+	vpxorq	xp1, xp1, xp1		;Clear the accumulator for this 32-byte block
+	mov	tmp, mul_array
+	xor	vec_i, vec_i
+
+.next_vect2:
+	mov	ptr, [src+vec_i]
+	XLDR	x0y, [ptr+pos]		;Get next source vector 32B
+	add	vec_i, PS
+	vpsraw	xtmpa, x0, 4		;Shift to put high nibble into bits 4-0
+	vshufi64x2 x0, x0, xtmpa, 0x44	;x0 lanes 0-1 = source bytes, lanes 2-3 = shifted source bytes
+	vpandq	x0, x0, xmask0f		;Mask bits 4-0
+	vmovdqu8 xgft1_loy, [tmp]	;Load array Ax{00}..{0f}, Ax{00}..{f0}
+	add	tmp, 32
+	vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x50	;lanes 0-1 = low-nibble table, lanes 2-3 = high-nibble table
+	vpshufb	   xgft1_lo, xgft1_lo, x0		;Lookup mul table
+	vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x0e	;Bring the high-nibble products (lanes 2-3) down to lanes 0-1
+	vpxorq	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
+	vpxorq	xp1, xp1, xgft1_hi		;xp1 += partial
+	cmp	vec_i, vec
+	jl	.next_vect2
+
+	XSTR	[dest1+pos], xp1y	;Only the low 256 bits hold the result
+	add	pos, 32			;Loop on 32 bytes at a time
+	cmp	pos, len
+	jle	.loop32
+
+	lea	tmp, [len + 32]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass
+
+	;; Tail len
+	mov	pos, len	;Overlapped offset length-32
+	jmp	.loop32		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_vect_dot_prod_avx512	; placeholder symbol emitted only when the AVX512 body is compiled out
+no_gf_vect_dot_prod_avx512:	; label only, no code; presumably keeps the win64 object non-empty - TODO confirm
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/erasure_code/gf_vect_mad_avx512.asm
+++ b/erasure_code/gf_vect_mad_avx512.asm
@@ -0,0 +1,193 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_avx512(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi	; System V AMD64 integer argument registers, in order
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11	; scratch register
+ %define return rax	; return value register
+ %define func(x) x:	; entry point is a plain label
+ %define FUNC_SAVE	; empty: no prologue is emitted for elf64
+ %define FUNC_RESTORE	; empty: no epilogue is emitted for elf64
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0   rcx	; Microsoft x64: the first four arguments arrive in rcx, rdx, r8, r9
+ %define arg1   rdx
+ %define arg2   r8
+ %define arg3   r9
+ %define arg4   r12 		; callee-saved; saved by FUNC_SAVE, then loaded from the stack
+ %define arg5   r15	; callee-saved; saved by FUNC_SAVE, then loaded from the stack
+ %define tmp    r11	; scratch register
+ %define return rax	; return value register
+ %define PS 8	; pointer size in bytes; defined only for win64, where arg(x) needs it
+ %define stack_size 16*3 + 3*8	; three 16-byte XMM slots plus three qwords (r12, r15, one unused)
+ %define arg(x)      [rsp + stack_size + PS + PS*x]	; slot x above the local frame and the return address
+ %define func(x) proc_frame x
+;;; Prologue: allocate the frame, save xmm6-xmm8, r12 and r15, then fetch stack arguments 4 and 5
+ %macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	vmovdqa	[rsp+16*0],xmm6		; xmm6-xmm8 back zmm6-zmm8, which this file uses
+	vmovdqa	[rsp+16*1],xmm7
+	vmovdqa	[rsp+16*2],xmm8
+	save_reg	r12,  3*16 + 0*8	; integer saves sit right after the three XMM slots
+	save_reg	r15,  3*16 + 1*8
+	end_prolog
+	mov	arg4, arg(4)		; r12 = argument 4 (src)
+	mov	arg5, arg(5)		; r15 = argument 5 (dest)
+ %endmacro
+;;; Epilogue: restore everything FUNC_SAVE saved, using the same offsets, and release the frame
+ %macro FUNC_RESTORE 0
+	vmovdqa	xmm6, [rsp+16*0]
+	vmovdqa	xmm7, [rsp+16*1]
+	vmovdqa	xmm8, [rsp+16*2]
+	mov	r12,  [rsp + 3*16 + 0*8]
+	mov	r15,  [rsp + 3*16 + 1*8]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+;;; Argument aliases for gf_vect_mad_avx512(len, vec, vec_i, mul_array, src, dest)
+%define len   arg0	; byte length; reduced by 64 at function entry
+%define vec   arg1	; not referenced by this routine
+%define vec_i    arg2	; scaled by 32 at entry to select one 32-byte table
+%define mul_array arg3	; base address of the multiplication tables
+%define	src   arg4	; source buffer
+%define dest  arg5	; destination buffer, updated in place
+%define pos   return	; running byte offset lives in rax until the return code is written
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use un-aligned load/store
+ %define XLDR vmovdqu8
+ %define XSTR vmovdqu8
+%else
+;;; Aligned addresses: use non-temporal load/store unless NO_NT_LDST is defined
+ %ifdef NO_NT_LDST
+  %define XLDR vmovdqa64	; plain vmovdqa has no zmm form; the EVEX aligned move is vmovdqa32/64
+  %define XSTR vmovdqa64
+ %else
+  %define XLDR vmovntdqa
+  %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define x0       zmm0	; source block; holds the high nibbles after shift and mask
+%define xtmpa    zmm1	; low nibbles of the source block
+%define xtmph    zmm2	; high-nibble lookup result, then the combined partial
+%define xtmpl    zmm3	; low-nibble lookup result
+%define xd       zmm4	; current 64 bytes of dest
+%define xtmpd    zmm5	; not referenced by gf_vect_mad_avx512
+%define xgft_hi  zmm6	; high-nibble table replicated to every 128-bit lane
+%define xgft_lo  zmm7	; low-nibble table replicated to every 128-bit lane
+%define xgft_loy ymm7	; 256-bit view used for the initial 32-byte table load
+%define xmask0f  zmm8	; 0x0f in every byte
+
+align 16	; gf_vect_mad_avx512: dest ^= GF-multiply(src, table vec_i); returns 0, or 1 if len < 64
+global gf_vect_mad_avx512:function
+func(gf_vect_mad_avx512)
+	FUNC_SAVE
+	sub	len, 64			;len = length - 64: last offset where a full 64-byte block fits
+	jl	.return_fail		;Fewer than 64 bytes: return 1
+	xor	pos, pos
+	mov	tmp, 0x0f
+	vpbroadcastb xmask0f, tmp	;Construct mask 0x0f0f0f...
+	sal	vec_i, 5		;Multiply by 32 (bytes per table)
+	vmovdqu8 xgft_loy, [vec_i+mul_array]	;Load array Cx{00}..{0f}, Cx{00}..{f0}
+	vshufi64x2 xgft_hi, xgft_lo, xgft_lo, 0x55	;Replicate lane 1 (high-nibble table) to all lanes
+	vshufi64x2 xgft_lo, xgft_lo, xgft_lo, 0x00	;Replicate lane 0 (low-nibble table) to all lanes
+	mov	tmp, -1
+	kmovq	k1, tmp			;k1 = all ones: every byte contributes on full passes
+
+.loop64:
+	XLDR	xd, [dest+pos]		;Get next dest vector
+	XLDR	x0, [src+pos]		;Get next source vector
+
+	vpandq	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
+	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
+	vpandq	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
+
+	vpshufb	xtmph {k1}{z}, xgft_hi, x0	;Lookup mul table of high nibble (zeroed where k1 is clear)
+	vpshufb	xtmpl {k1}{z}, xgft_lo, xtmpa	;Lookup mul table of low nibble (zeroed where k1 is clear)
+	vpxorq	xtmph, xtmph, xtmpl	;GF add high and low partials
+	vpxorq	xd, xd, xtmph		;xd += partial
+
+	XSTR	[dest+pos], xd
+	add	pos, 64			;Loop on 64 bytes at a time
+	cmp	pos, len
+	jle	.loop64
+
+	lea	tmp, [len + 64]		;tmp = original length
+	cmp	pos, tmp
+	je	.return_pass		;pos reached the end exactly: no partial block left
+
+	;; Tail len: redo the last 64 bytes, masked so only bytes not yet accumulated are updated
+	mov	pos, (1 << 63)
+	lea	tmp, [len + 64 - 1]
+	and	tmp, 63			;tmp = (length - 1) mod 64
+	sarx	pos, pos, tmp		;Arithmetic shift sets the top tmp+1 bits = the remaining tail bytes
+	kmovq	k1, pos
+	mov	pos, len	;Overlapped offset length-64
+	jmp	.loop64		;Do one more overlap pass
+
+.return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+.return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_gf_vect_mad_avx512	; placeholder symbol emitted only when the AVX512 body is compiled out
+no_gf_vect_mad_avx512:	; label only, no code; presumably keeps the win64 object non-empty - TODO confirm
+%endif
+%endif  ; ifdef HAVE_AS_KNOWS_AVX512
--- a/include/multibinary.asm
+++ b/include/multibinary.asm
@@ -0,0 +1,271 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions 
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _MULTIBINARY_ASM_
+%define _MULTIBINARY_ASM_
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ %define mbin_def_ptr	dd	; data directive sized for one code pointer (4 bytes)
+ %define mbin_ptr_sz	dword	; operand size of the dispatch pointer
+ %define mbin_rdi	edi	; 32-bit registers stand in for the mbin_r* names
+ %define mbin_rsi	esi
+ %define mbin_rax	eax
+ %define mbin_rbx	ebx
+ %define mbin_rcx	ecx
+ %define mbin_rdx	edx
+%else
+ %define mbin_def_ptr	dq	; every other output format: 8-byte pointer
+ %define mbin_ptr_sz	qword
+ %define mbin_rdi	rdi
+ %define mbin_rsi	rsi
+ %define mbin_rax	rax
+ %define mbin_rbx	rbx
+ %define mbin_rcx	rcx
+ %define mbin_rdx	rdx
+%endif
+
+;;;;
+; multibinary macro (1 parameter: function name):
+;   creates the visible entry point, which jumps through the HW optimized call pointer
+;   creates the first-call stub that initializes the HW optimized call pointer
+;;;;
+%macro mbin_interface 1
+	;;;;
+	; *_dispatched is defaulted to *_mbinit and replaced on first call.
+	; Therefore, *_dispatch_init is only executed on first call.
+	;;;;
+	section .data
+	%1_dispatched:
+		mbin_def_ptr	%1_mbinit
+
+	section .text
+	global %1:function
+	%1_mbinit:
+		;;; reached only through the initial pointer value: select the implementation
+		call	%1_dispatch_init
+		;;; falls through to the entry point, which now jumps to the selected function
+	%1:
+		jmp	mbin_ptr_sz [%1_dispatched]
+%endmacro
+
+;;;;;
+; mbin_dispatch_init parameters
+; Use this function when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+;;;;;
+%macro mbin_dispatch_init 4
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi	; every register touched below is saved and restored
+		push	mbin_rax
+		push	mbin_rbx
+		push	mbin_rcx
+		push	mbin_rdx
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+		mov	eax, 1
+		cpuid
+		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		lea	mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func; lea leaves the cmp flags intact
+		jne	_%1_init_done ; AVX is not available so end
+		mov	mbin_rsi, mbin_rbx
+
+		;; Try for AVX2
+		xor	ecx, ecx
+		mov	eax, 7
+		cpuid
+		test	ebx, FLAG_CPUID7_EBX_AVX2
+		lea	mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func; ebx may be overwritten, the test is done
+		cmovne	mbin_rsi, mbin_rbx
+
+		;; Does it have xmm and ymm support
+		xor	ecx, ecx
+		xgetbv
+		and	eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+		je	_%1_init_done
+		lea	mbin_rsi, [%2 WRT_OPT]	; XMM+YMM bits not both set: fall back to the base function
+
+	_%1_init_done:
+		pop	mbin_rdx
+		pop	mbin_rcx
+		pop	mbin_rbx
+		pop	mbin_rax
+		mov	[%1_dispatched], mbin_rsi	; publish the choice before rsi is restored
+		pop	mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init2 parameters
+;  Cases where only base functions are available: no CPU feature test is made
+; 1-> function name
+; 2-> base function
+;;;;;
+%macro mbin_dispatch_init2 2
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi	; rsi is the only register used; preserve it
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default
+		mov	[%1_dispatched], mbin_rsi	; always dispatch to the base function
+		pop	mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init5 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init5 5
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi	; every register touched below is saved and restored
+		push	mbin_rax
+		push	mbin_rbx
+		push	mbin_rcx
+		push	mbin_rdx
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov	eax, 1
+		cpuid
+		; Test for SSE4.1
+		test	ecx, FLAG_CPUID1_ECX_SSE4_1
+		lea	mbin_rbx, [%3 WRT_OPT] ; SSE opt func; lea leaves the test flags intact
+		cmovne	mbin_rsi, mbin_rbx
+
+		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+		lea	mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
+		jne	_%1_init_done ; AVX is not available so end
+		mov	mbin_rsi, mbin_rbx
+
+		;; Try for AVX2
+		xor	ecx, ecx
+		mov	eax, 7
+		cpuid
+		test	ebx, FLAG_CPUID7_EBX_AVX2
+		lea	mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
+		cmovne	mbin_rsi, mbin_rbx
+
+		;; Does it have xmm and ymm support
+		xor	ecx, ecx
+		xgetbv
+		and	eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+		je	_%1_init_done
+		lea	mbin_rsi, [%3 WRT_OPT]	; XMM+YMM bits not both set: fall back to the SSE function
+
+	_%1_init_done:
+		pop	mbin_rdx
+		pop	mbin_rcx
+		pop	mbin_rbx
+		pop	mbin_rax
+		mov	[%1_dispatched], mbin_rsi	; publish the choice before rsi is restored
+		pop	mbin_rsi
+		ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init6 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+;;;;;
+%macro mbin_dispatch_init6 6
+	section .text
+	%1_dispatch_init:
+		push	mbin_rsi	; every register touched below is saved and restored
+		push	mbin_rax
+		push	mbin_rbx
+		push	mbin_rcx
+		push	mbin_rdx
+		push	mbin_rdi
+		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+		mov	eax, 1
+		cpuid
+		mov	ebx, ecx ; save cpuid1.ecx
+		test	ecx, FLAG_CPUID1_ECX_SSE4_1
+		je	_%1_init_done	  ; Use base function if no SSE4_1
+		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+		;; Test for XMM_YMM support/AVX
+		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
+		je	_%1_init_done	  ; No OSXSAVE: xgetbv is not attempted, keep SSE
+		xor	ecx, ecx
+		xgetbv	; xcr -> edx:eax
+		mov	edi, eax	  ; save xgetbv.eax for the AVX512 state test below
+
+		and	eax, FLAG_XGETBV_EAX_XMM_YMM
+		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
+		jne	_%1_init_done
+		test	ebx, FLAG_CPUID1_ECX_AVX	; ebx still holds the saved cpuid1.ecx
+		je	_%1_init_done
+		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+		;; Test for AVX2
+		xor	ecx, ecx
+		mov	eax, 7
+		cpuid
+		test	ebx, FLAG_CPUID7_EBX_AVX2
+		je	_%1_init_done		; No AVX2 possible
+		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func
+
+		;; Test for AVX512
+		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
+		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
+		jne	_%1_init_done	  ; No AVX512 possible
+		and	ebx, FLAGS_CPUID7_ECX_AVX512_G1	; mask is applied to cpuid7.ebx despite the _ECX_ in its name
+		cmp	ebx, FLAGS_CPUID7_ECX_AVX512_G1
+		lea	mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt; lea leaves the cmp flags intact
+		cmove	mbin_rsi, mbin_rbx
+
+	_%1_init_done:
+		pop	mbin_rdi
+		pop	mbin_rdx
+		pop	mbin_rcx
+		pop	mbin_rbx
+		pop	mbin_rax
+		mov	[%1_dispatched], mbin_rsi	; publish the choice before rsi is restored
+		pop	mbin_rsi
+		ret
+%endmacro
+
+%endif ; ifndef _MULTIBINARY_ASM_
--- a/include/reg_sizes.asm
+++ b/include/reg_sizes.asm
@@ -30,6 +30,14 @@
 %ifndef _REG_SIZES_ASM_
 %define _REG_SIZES_ASM_

+%ifdef __NASM_VER__
+%ifidn __OUTPUT_FORMAT__, win64
+%error nasm not supported in windows
+%else
+%define endproc_frame	; defined as empty under NASM so sources that end with endproc_frame still assemble
+%endif	; win64
+%endif	; __NASM_VER__
+
 %define EFLAGS_HAS_CPUID        (1<<21)
 %define FLAG_CPUID1_ECX_CLMUL   (1<<1)
 %define FLAG_CPUID1_EDX_SSE2    (1<<26)
@@ -41,7 +49,24 @@
 %define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
 %define FLAG_CPUID1_ECX_AVX     (1<<28)
 %define FLAG_CPUID1_EBX_AVX2    (1<<5)
-%define FLAG_XGETBV_EAX_XMM_YMM	0x6
+
+%define FLAG_CPUID7_EBX_AVX2           (1<<5)
+%define FLAG_CPUID7_EBX_AVX512F        (1<<16)
+%define FLAG_CPUID7_EBX_AVX512DQ       (1<<17)
+%define FLAG_CPUID7_EBX_AVX512IFMA     (1<<21)
+%define FLAG_CPUID7_EBX_AVX512PF       (1<<26)
+%define FLAG_CPUID7_EBX_AVX512ER       (1<<27)
+%define FLAG_CPUID7_EBX_AVX512CD       (1<<28)
+%define FLAG_CPUID7_EBX_AVX512BW       (1<<30)
+%define FLAG_CPUID7_EBX_AVX512VL       (1<<31)
+%define FLAG_CPUID7_ECX_AVX512VBMI     (1<<1)
+;;; NOTE: the group mask below is named _ECX_ but every member is a CPUID.7 EBX flag (F, VL, BW, CD, DQ)
+%define FLAGS_CPUID7_ECX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ)
+;;; XGETBV eax masks
+%define FLAG_XGETBV_EAX_XMM            (1<<1)
+%define FLAG_XGETBV_EAX_YMM            (1<<2)
+%define FLAG_XGETBV_EAX_XMM_YMM        0x6	; XMM and YMM bits together
+%define FLAG_XGETBV_EAX_ZMM_OPM        0xe0	; bits 5-7; presumably the opmask and ZMM state components - confirm against the SDM

 %define FLAG_CPUID1_EAX_AVOTON 0x000406d0