mirror of
https://github.com/intel/isa-l.git
synced 2024-12-12 09:23:50 +01:00
Initial commit isa-l v2.14.1
Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
This commit is contained in:
commit
00c1efc109
26
About_bsd.txt
Normal file
26
About_bsd.txt
Normal file
@ -0,0 +1,26 @@
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
100
Makefile.am
Normal file
100
Makefile.am
Normal file
@ -0,0 +1,100 @@
|
||||
EXTRA_DIST = autogen.sh Makefile.unx make.inc Makefile.nmake isa-l.def About_bsd.txt
|
||||
CLEANFILES =
|
||||
LDADD =
|
||||
AM_MAKEFLAGS = --no-print-directory
|
||||
noinst_HEADERS =
|
||||
pkginclude_HEADERS = include/test.h
|
||||
noinst_LTLIBRARIES =
|
||||
INCLUDE = -I $(srcdir)/include
|
||||
AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} ${D}
|
||||
|
||||
lsrc=
|
||||
extern_hdrs=
|
||||
other_src=
|
||||
check_tests=
|
||||
unit_tests=
|
||||
perf_tests=
|
||||
unit_tests_extra=
|
||||
perf_tests_extra=
|
||||
examples=
|
||||
other_tests=
|
||||
lsrc32=
|
||||
unit_tests32=
|
||||
perf_tests32=
|
||||
|
||||
# Include units
|
||||
include erasure_code/Makefile.am
|
||||
|
||||
# LIB version info not necessarily the same as package version
|
||||
LIBISAL_CURRENT=2
|
||||
LIBISAL_REVISION=14
|
||||
LIBISAL_AGE=0
|
||||
|
||||
lib_LTLIBRARIES = libisal.la
|
||||
pkginclude_HEADERS += $(sort ${extern_hdrs})
|
||||
libisal_la_SOURCES = ${lsrc}
|
||||
nobase_include_HEADERS = isa-l.h
|
||||
libisal_la_LDFLAGS = $(AM_LDFLAGS) \
|
||||
-version-info $(LIBISAL_CURRENT):$(LIBISAL_REVISION):$(LIBISAL_AGE)
|
||||
libisal_la_LIBADD = ${noinst_LTLIBRARIES}
|
||||
|
||||
EXTRA_DIST += ${other_src}
|
||||
EXTRA_DIST += Release_notes.txt
|
||||
|
||||
# For tests
|
||||
LDADD += libisal.la
|
||||
check_PROGRAMS = ${check_tests}
|
||||
TESTS = ${check_tests}
|
||||
|
||||
# For additional tests
|
||||
EXTRA_PROGRAMS = ${unit_tests}
|
||||
EXTRA_PROGRAMS += ${perf_tests}
|
||||
EXTRA_PROGRAMS += ${other_tests}
|
||||
EXTRA_PROGRAMS += ${examples}
|
||||
CLEANFILES += ${EXTRA_PROGRAMS}
|
||||
|
||||
perfs: ${perf_tests}
|
||||
tests: ${unit_tests}
|
||||
other: ${other_tests}
|
||||
perf: $(addsuffix .run,$(perf_tests))
|
||||
ex: ${examples}
|
||||
test: $(addsuffix .run,$(unit_tests))
|
||||
|
||||
# Build rule to run tests
|
||||
%.run: %
|
||||
$<
|
||||
@echo Completed run: $<
|
||||
|
||||
# Support for yasm
|
||||
CCAS = ${srcdir}/tools/yasm-filter.sh
|
||||
EXTRA_DIST += tools/yasm-filter.sh
|
||||
AM_CCASFLAGS = ${yasm_args} ${INCLUDE}
|
||||
|
||||
.asm.s:
|
||||
@echo " MKTMP " $@;
|
||||
@cp $< $@
|
||||
|
||||
# Generate isa-l.h
|
||||
BUILT_SOURCES = isa-l.h
|
||||
CLEANFILES += isa-l.h
|
||||
isa-l.h:
|
||||
@echo 'Building $@'
|
||||
@echo '' >> $@
|
||||
@echo '#ifndef _ISAL_H_' >> $@
|
||||
@echo '#define _ISAL_H_' >> $@
|
||||
@echo '' >> $@
|
||||
@echo '#define.ISAL_MAJOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
|
||||
@echo '#define.ISAL_MINOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
|
||||
@echo '#define.ISAL_PATCH_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
|
||||
@echo '#define ISAL_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
|
||||
@echo '#define ISAL_VERSION ISAL_MAKE_VERSION(ISAL_MAJOR_VERSION, ISAL_MINOR_VERSION, ISAL_PATCH_VERSION)' >> $@
|
||||
@echo '' >> $@
|
||||
@for unit in ${extern_hdrs}; do echo "#include <isa-l/$$unit>" | sed -e 's;include/;;' >> $@; done
|
||||
@echo '#endif //_ISAL_H_' >> $@
|
||||
|
||||
|
||||
license = bsd
|
||||
licc = $(srcdir)/doc/license_$(license)_c.txt
|
||||
lica = $(srcdir)/doc/license_$(license)_asm.txt
|
||||
licm = $(srcdir)/doc/license_$(license)_make.txt
|
||||
|
88
Makefile.nmake
Normal file
88
Makefile.nmake
Normal file
@ -0,0 +1,88 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj
|
||||
|
||||
INCLUDES = -I. -Ierasure_code -Iinclude
|
||||
LINKFLAGS = /nologo
|
||||
CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D)
|
||||
AFLAGS = -f win64 $(INCLUDES) $(D)
|
||||
CC = icl
|
||||
AS = yasm
|
||||
|
||||
lib: bin static dll
|
||||
static: bin isa-l_static.lib
|
||||
dll: bin isa-l.dll
|
||||
|
||||
bin: ; -mkdir $@
|
||||
|
||||
isa-l_static.lib: $(objs)
|
||||
lib -out:$@ $?
|
||||
|
||||
isa-l.dll: $(objs)
|
||||
link -out:$@ -dll -def:isa-l.def $?
|
||||
|
||||
{erasure_code}.c.obj:
|
||||
$(CC) $(CFLAGS) /c -Fo$@ $?
|
||||
{erasure_code}.asm.obj:
|
||||
$(AS) $(AFLAGS) -o $@ $?
|
||||
|
||||
|
||||
|
||||
.obj.exe:
|
||||
link /out:$@ $(LINKFLAGS) isa-l.lib $?
|
||||
|
||||
# Check tests
|
||||
checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe
|
||||
|
||||
checks: lib $(checks)
|
||||
$(checks): $(@B).obj
|
||||
check: $(checks)
|
||||
!$?
|
||||
|
||||
# Unit tests
|
||||
tests = erasure_code_base_test.exe erasure_code_sse_test.exe gf_2vect_dot_prod_sse_test.exe gf_3vect_dot_prod_sse_test.exe gf_4vect_dot_prod_sse_test.exe gf_5vect_dot_prod_sse_test.exe gf_6vect_dot_prod_sse_test.exe gf_vect_dot_prod_avx_test.exe gf_vect_dot_prod_base_test.exe gf_vect_dot_prod_sse_test.exe gf_vect_dot_prod_test.exe gf_vect_mad_test.exe gf_vect_mul_avx_test.exe gf_vect_mul_base_test.exe gf_vect_mul_sse_test.exe
|
||||
|
||||
tests: lib $(tests)
|
||||
$(tests): $(@B).obj
|
||||
|
||||
# Performance tests
|
||||
perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe
|
||||
|
||||
perfs: lib $(perfs)
|
||||
$(perfs): $(@B).obj
|
||||
|
||||
clean:
|
||||
-if exist *.obj del *.obj
|
||||
-if exist bin\*.obj del bin\*.obj
|
||||
-if exist isa-l_static.lib del isa-l_static.lib
|
||||
-if exist *.exe del *.exe
|
||||
-if exist isa-l.lib del isa-l.lib
|
||||
-if exist isa-l.dll del isa-l.dll
|
||||
|
41
Makefile.unx
Normal file
41
Makefile.unx
Normal file
@ -0,0 +1,41 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
units = erasure_code
|
||||
|
||||
default: lib
|
||||
|
||||
include $(foreach unit,$(units), $(unit)/Makefile.am)
|
||||
|
||||
# Override individual lib names to make one inclusive library.
|
||||
lib_name := bin/isa-l.a
|
||||
|
||||
include make.inc
|
||||
|
||||
VPATH = . $(units) include
|
19
README
Normal file
19
README
Normal file
@ -0,0 +1,19 @@
|
||||
=================================================
|
||||
Intel(R) Intelligent Storage Acceleration Library
|
||||
=================================================
|
||||
|
||||
Build Prerequisites
|
||||
===================
|
||||
|
||||
ISA-L requires yasm version 1.2 or later.
|
||||
|
||||
Building ISA-L
|
||||
==============
|
||||
|
||||
To build and install the library it is usually sufficient to run the following.
|
||||
|
||||
./configure
|
||||
make
|
||||
sudo make install
|
||||
|
||||
Other targets include: make check, make tests and make perfs.
|
74
Release_notes.txt
Normal file
74
Release_notes.txt
Normal file
@ -0,0 +1,74 @@
|
||||
=============================================================================
|
||||
v2.14 Intel Intelligent Storage Acceleration Library Release Notes
|
||||
Open Source Version
|
||||
=============================================================================
|
||||
|
||||
=============================================================================
|
||||
RELEASE NOTE CONTENTS
|
||||
=============================================================================
|
||||
1. KNOWN ISSUES
|
||||
2. FIXED ISSUES
|
||||
3. CHANGE LOG & FEATURES ADDED
|
||||
|
||||
=============================================================================
|
||||
1. KNOWN ISSUES
|
||||
=============================================================================
|
||||
|
||||
* Only erasure code unit included in open source version at this time.
|
||||
|
||||
* Perf tests do not run in Windows environment.
|
||||
|
||||
* 32-bit lib is not supported in Windows.
|
||||
|
||||
=============================================================================
|
||||
2. FIXED ISSUES
|
||||
=============================================================================
|
||||
v2.14
|
||||
|
||||
* Building in unit directories is no longer supported removing the issue of
|
||||
leftover object files causing the top-level make build to fail.
|
||||
|
||||
v2.10
|
||||
|
||||
* Fix for windows register save overlap in gf_{3-6}vect_dot_prod_sse.asm. Only
|
||||
affects windows versions of erasure code. GP register saves/restore were
|
||||
pushed to same stack area as XMM.
|
||||
|
||||
=============================================================================
|
||||
3. CHANGE LOG & FEATURES ADDED
|
||||
=============================================================================
|
||||
v2.14
|
||||
|
||||
* Autoconf and autotools build allows easier porting to additional systems.
|
||||
Previous make system still available to embedded users with Makefile.unx.
|
||||
|
||||
* Includes update for building on Mac OS X/darwin systems. Add --target=darwin
|
||||
to ./configure step.
|
||||
|
||||
v2.13
|
||||
|
||||
* Erasure code improvments
|
||||
- 32-bit port of optimized gf_vect_dot_prod() functions. This makes
|
||||
ec_encode_data() functions much faster on 32-bit processors.
|
||||
- Avoton performance improvements. Performance on Avoton for
|
||||
gf_vect_dot_prod() and ec_encode_data() can improve by as much as 20%.
|
||||
|
||||
v2.11
|
||||
|
||||
* Incremental erasure code. New functions added to erasure code to handle
|
||||
single source update of code blocks. The function ec_encode_data_update()
|
||||
works with parameters similar to ec_encode_data() but are called incrementally
|
||||
with each source block. These versions are useful when source blocks are not
|
||||
all available at once.
|
||||
|
||||
v2.10
|
||||
|
||||
* Erasure code updates
|
||||
- New AVX and AVX2 support functions.
|
||||
- Changes min len requirement on gf_vect_dot_prod() to 32 from 16.
|
||||
- Tests include both source and parity recovery with ec_encode_data().
|
||||
- New encoding examples with Vandermonde or Cauchy matrix.
|
||||
|
||||
v2.8
|
||||
|
||||
* First open release of erasure code unit that is part of ISA-L.
|
17
autogen.sh
Executable file
17
autogen.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/sh -e
|
||||
|
||||
autoreconf --install --symlink -f
|
||||
|
||||
libdir() {
|
||||
echo $(cd $1/$(gcc -print-multi-os-directory); pwd)
|
||||
}
|
||||
|
||||
args="--prefix=/usr --libdir=$(libdir /usr/lib)"
|
||||
|
||||
echo
|
||||
echo "----------------------------------------------------------------"
|
||||
echo "Initialized build system. For a common configuration please run:"
|
||||
echo "----------------------------------------------------------------"
|
||||
echo
|
||||
echo "./configure $args"
|
||||
echo
|
112
configure.ac
Normal file
112
configure.ac
Normal file
@ -0,0 +1,112 @@
|
||||
# -*- Autoconf -*-
|
||||
# Process this file with autoconf to produce a configure script.
|
||||
|
||||
AC_PREREQ(2.69)
|
||||
AC_INIT([libisal],
|
||||
[2.14.0],
|
||||
[sg.support.isal@intel.com],
|
||||
[isa-l],
|
||||
[http://01.org/storage-acceleration-library])
|
||||
AC_CONFIG_SRCDIR([])
|
||||
AC_CONFIG_AUX_DIR([build-aux])
|
||||
AM_INIT_AUTOMAKE([
|
||||
foreign
|
||||
1.11
|
||||
-Wall
|
||||
-Wno-portability
|
||||
silent-rules
|
||||
tar-pax
|
||||
no-dist-gzip
|
||||
dist-xz
|
||||
subdir-objects
|
||||
])
|
||||
AM_PROG_AS
|
||||
|
||||
# Check for programs
|
||||
AC_PROG_CC_STDC
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
AM_SILENT_RULES([yes])
|
||||
LT_INIT
|
||||
AC_PREFIX_DEFAULT([/usr])
|
||||
AC_PROG_SED
|
||||
AC_PROG_MKDIR_P
|
||||
AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
|
||||
if test "$HAVE_YASM" = "no"; then
|
||||
AC_MSG_ERROR([yasm not found as required.])
|
||||
fi
|
||||
AC_MSG_CHECKING([checking for modern yasm])
|
||||
AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
|
||||
if yasm -f elf64 -p gas conftest.c ; then
|
||||
AC_MSG_RESULT([yes])
|
||||
else
|
||||
AC_MSG_FAILURE([need modern yasm])
|
||||
fi
|
||||
|
||||
# Options
|
||||
AC_ARG_ENABLE([debug],
|
||||
AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]),
|
||||
[], [enable_debug=no])
|
||||
AS_IF([test "x$enable_debug" = "xyes"], [
|
||||
AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.])
|
||||
])
|
||||
|
||||
|
||||
case $target in
|
||||
*linux*) arch=linux yasm_args="-f elf64";;
|
||||
*darwin*) arch=darwin yasm_args="-f macho64 --prefix=_ ";;
|
||||
*netbsd*) arch=netbsd yasm_args="-f elf64";;
|
||||
*) arch=unknown yasm_args="-f elf64";;
|
||||
esac
|
||||
AC_SUBST([yasm_args])
|
||||
AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin")
|
||||
AC_MSG_RESULT([Using yasm args target "$arch" "$yasm_args"])
|
||||
|
||||
# Check for header files
|
||||
#AC_CHECK_HEADERS([limits.h stddef.h stdint.h stdlib.h string.h sys/time.h unistd.h])
|
||||
AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h])
|
||||
|
||||
# Checks for typedefs, structures, and compiler characteristics.
|
||||
AC_C_INLINE
|
||||
AC_TYPE_SIZE_T
|
||||
AC_TYPE_UINT16_T
|
||||
AC_TYPE_UINT32_T
|
||||
AC_TYPE_UINT64_T
|
||||
AC_TYPE_UINT8_T
|
||||
|
||||
# Checks for library functions.
|
||||
AC_FUNC_MALLOC # Used only in tests
|
||||
AC_CHECK_FUNCS([memmove memset])
|
||||
|
||||
my_CFLAGS="\
|
||||
-Wall \
|
||||
-Wchar-subscripts \
|
||||
-Wformat-security \
|
||||
-Wnested-externs \
|
||||
-Wpointer-arith \
|
||||
-Wshadow \
|
||||
-Wstrict-prototypes \
|
||||
-Wtype-limits \
|
||||
"
|
||||
AC_SUBST([my_CFLAGS])
|
||||
|
||||
AC_CONFIG_FILES([\
|
||||
Makefile\
|
||||
libisal.pc
|
||||
])
|
||||
|
||||
AC_OUTPUT
|
||||
AC_MSG_RESULT([
|
||||
$PACKAGE $VERSION
|
||||
=====
|
||||
|
||||
prefix: ${prefix}
|
||||
sysconfdir: ${sysconfdir}
|
||||
libdir: ${libdir}
|
||||
includedir: ${includedir}
|
||||
|
||||
compiler: ${CC}
|
||||
cflags: ${CFLAGS}
|
||||
ldflags: ${LDFLAGS}
|
||||
|
||||
debug: ${enable_debug}
|
||||
])
|
159
erasure_code/Makefile.am
Normal file
159
erasure_code/Makefile.am
Normal file
@ -0,0 +1,159 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
lsrc += erasure_code/ec_highlevel_func.c \
|
||||
erasure_code/ec_base.c \
|
||||
erasure_code/gf_vect_mul_sse.asm \
|
||||
erasure_code/gf_vect_mul_avx.asm \
|
||||
erasure_code/gf_vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_2vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_3vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_4vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_5vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_6vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_5vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_6vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_5vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_6vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_vect_mad_sse.asm \
|
||||
erasure_code/gf_2vect_mad_sse.asm \
|
||||
erasure_code/gf_3vect_mad_sse.asm \
|
||||
erasure_code/gf_4vect_mad_sse.asm \
|
||||
erasure_code/gf_5vect_mad_sse.asm \
|
||||
erasure_code/gf_6vect_mad_sse.asm \
|
||||
erasure_code/gf_vect_mad_avx.asm \
|
||||
erasure_code/gf_2vect_mad_avx.asm \
|
||||
erasure_code/gf_3vect_mad_avx.asm \
|
||||
erasure_code/gf_4vect_mad_avx.asm \
|
||||
erasure_code/gf_5vect_mad_avx.asm \
|
||||
erasure_code/gf_6vect_mad_avx.asm \
|
||||
erasure_code/gf_vect_mad_avx2.asm \
|
||||
erasure_code/gf_2vect_mad_avx2.asm \
|
||||
erasure_code/gf_3vect_mad_avx2.asm \
|
||||
erasure_code/gf_4vect_mad_avx2.asm \
|
||||
erasure_code/gf_5vect_mad_avx2.asm \
|
||||
erasure_code/gf_6vect_mad_avx2.asm \
|
||||
erasure_code/ec_multibinary.asm
|
||||
|
||||
lsrc32 += erasure_code/ec_highlevel_func.c \
|
||||
erasure_code/ec_multibinary.asm \
|
||||
erasure_code/ec_base.c \
|
||||
erasure_code/gf_vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_2vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_3vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_4vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx2.asm
|
||||
|
||||
unit_tests32 += erasure_code_base_test \
|
||||
erasure_code/erasure_code_test \
|
||||
erasure_code/erasure_code_sse_test \
|
||||
erasure_code/gf_vect_mul_test \
|
||||
erasure_code/gf_vect_mul_base_test \
|
||||
erasure_code/gf_vect_dot_prod_base_test \
|
||||
erasure_code/gf_vect_dot_prod_test \
|
||||
erasure_code/gf_vect_dot_prod_avx_test \
|
||||
erasure_code/gf_vect_dot_prod_sse_test \
|
||||
erasure_code/gf_2vect_dot_prod_sse_test \
|
||||
erasure_code/gf_3vect_dot_prod_sse_test \
|
||||
erasure_code/gf_4vect_dot_prod_sse_test
|
||||
|
||||
perf_tests32 += erasure_code/gf_vect_mul_perf \
|
||||
erasure_code/gf_vect_dot_prod_perf \
|
||||
erasure_code/erasure_code_perf \
|
||||
erasure_code/erasure_code_base_perf \
|
||||
erasure_code/erasure_code_sse_perf \
|
||||
erasure_code/gf_vect_dot_prod_1tbl \
|
||||
erasure_code/gf_vect_dot_prod_avx_perf\
|
||||
erasure_code/gf_vect_dot_prod_sse_perf\
|
||||
erasure_code/gf_2vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_3vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_4vect_dot_prod_sse_perf
|
||||
|
||||
extern_hdrs += include/erasure_code.h \
|
||||
include/gf_vect_mul.h
|
||||
|
||||
other_src += erasure_code/ec_base.h \
|
||||
include/reg_sizes.asm
|
||||
|
||||
check_tests += erasure_code/gf_vect_mul_test \
|
||||
erasure_code/erasure_code_test \
|
||||
erasure_code/gf_inverse_test \
|
||||
erasure_code/erasure_code_update_test
|
||||
|
||||
unit_tests += erasure_code/gf_vect_mul_sse_test \
|
||||
erasure_code/gf_vect_mul_avx_test \
|
||||
erasure_code/gf_vect_mul_base_test \
|
||||
erasure_code/gf_vect_dot_prod_sse_test \
|
||||
erasure_code/gf_vect_dot_prod_avx_test \
|
||||
erasure_code/gf_2vect_dot_prod_sse_test \
|
||||
erasure_code/gf_3vect_dot_prod_sse_test \
|
||||
erasure_code/gf_4vect_dot_prod_sse_test \
|
||||
erasure_code/gf_5vect_dot_prod_sse_test \
|
||||
erasure_code/gf_6vect_dot_prod_sse_test \
|
||||
erasure_code/gf_vect_dot_prod_base_test \
|
||||
erasure_code/gf_vect_dot_prod_test \
|
||||
erasure_code/gf_vect_mad_test \
|
||||
erasure_code/erasure_code_base_test \
|
||||
erasure_code/erasure_code_sse_test
|
||||
|
||||
perf_tests += erasure_code/gf_vect_mul_perf \
|
||||
erasure_code/gf_vect_mul_sse_perf \
|
||||
erasure_code/gf_vect_mul_avx_perf \
|
||||
erasure_code/gf_vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_vect_dot_prod_avx_perf \
|
||||
erasure_code/gf_2vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_3vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_4vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_5vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_6vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_vect_dot_prod_perf \
|
||||
erasure_code/gf_vect_dot_prod_1tbl \
|
||||
erasure_code/gf_vect_mad_perf \
|
||||
erasure_code/erasure_code_perf \
|
||||
erasure_code/erasure_code_base_perf \
|
||||
erasure_code/erasure_code_sse_perf \
|
||||
erasure_code/erasure_code_update_perf
|
||||
|
||||
other_src += include/test.h \
|
||||
include/types.h
|
360
erasure_code/ec_base.c
Normal file
360
erasure_code/ec_base.c
Normal file
@ -0,0 +1,360 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <limits.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "ec_base.h" // for GF tables
|
||||
#include "types.h"
|
||||
|
||||
unsigned char gf_mul(unsigned char a, unsigned char b)
|
||||
{
|
||||
#ifndef GF_LARGE_TABLES
|
||||
int i;
|
||||
|
||||
if ((a == 0) || (b == 0))
|
||||
return 0;
|
||||
|
||||
return gff_base[(i = gflog_base[a] + gflog_base[b]) > 254 ? i - 255 : i];
|
||||
#else
|
||||
return gf_mul_table_base[b * 256 + a];
|
||||
#endif
|
||||
}
|
||||
|
||||
unsigned char gf_inv(unsigned char a)
|
||||
{
|
||||
#ifndef GF_LARGE_TABLES
|
||||
if (a == 0)
|
||||
return 0;
|
||||
|
||||
return gff_base[255 - gflog_base[a]];
|
||||
#else
|
||||
return gf_inv_table_base[a];
|
||||
#endif
|
||||
}
|
||||
|
||||
void gf_gen_rs_matrix(unsigned char *a, int m, int k)
|
||||
{
|
||||
int i, j;
|
||||
unsigned char p, gen = 1;
|
||||
|
||||
memset(a, 0, k * m);
|
||||
for (i = 0; i < k; i++)
|
||||
a[k * i + i] = 1;
|
||||
|
||||
for (i = k; i < m; i++) {
|
||||
p = 1;
|
||||
for (j = 0; j < k; j++) {
|
||||
a[k * i + j] = p;
|
||||
p = gf_mul(p, gen);
|
||||
}
|
||||
gen = gf_mul(gen, 2);
|
||||
}
|
||||
}
|
||||
|
||||
void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k)
|
||||
{
|
||||
int i, j;
|
||||
unsigned char *p;
|
||||
|
||||
// Identity matrix in high position
|
||||
memset(a, 0, k * m);
|
||||
for (i = 0; i < k; i++)
|
||||
a[k * i + i] = 1;
|
||||
|
||||
// For the rest choose 1/(i + j) | i != j
|
||||
p = &a[k * k];
|
||||
for (i = k; i < m; i++)
|
||||
for (j = 0; j < k; j++)
|
||||
*p++ = gf_inv(i ^ j);
|
||||
|
||||
}
|
||||
|
||||
// Invert a square n x n matrix over GF(2^8) by Gauss-Jordan elimination
// with partial (row-swap) pivoting.
//   in_mat  - matrix to invert; NOTE: reduced in place, so its contents
//             are destroyed by this call
//   out_mat - receives the inverse on success
//   n       - matrix dimension
// Returns 0 on success, -1 if the matrix is singular.
int gf_invert_matrix(unsigned char *in_mat, unsigned char *out_mat, const int n)
{
	int i, j, k;
	unsigned char temp;

	// Set out_mat[] to the identity matrix
	for (i = 0; i < n * n; i++)	// memset(out_mat, 0, n*n)
		out_mat[i] = 0;

	for (i = 0; i < n; i++)
		out_mat[i * n + i] = 1;

	// Inverse: every row operation applied to in_mat is mirrored on
	// out_mat, so when in_mat becomes I, out_mat is the inverse.
	for (i = 0; i < n; i++) {
		// Check for 0 in pivot element
		if (in_mat[i * n + i] == 0) {
			// Find a row with non-zero in current column and swap
			for (j = i + 1; j < n; j++)
				if (in_mat[j * n + i])
					break;

			if (j == n)	// Couldn't find means it's singular
				return -1;

			for (k = 0; k < n; k++) {	// Swap rows i,j
				temp = in_mat[i * n + k];
				in_mat[i * n + k] = in_mat[j * n + k];
				in_mat[j * n + k] = temp;

				temp = out_mat[i * n + k];
				out_mat[i * n + k] = out_mat[j * n + k];
				out_mat[j * n + k] = temp;
			}
		}

		temp = gf_inv(in_mat[i * n + i]);	// 1/pivot
		for (j = 0; j < n; j++) {	// Scale row i by 1/pivot
			in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
			out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
		}

		// Eliminate column i from every other row; in GF(2^8)
		// subtraction is XOR, so row_j ^= pivot_coeff * row_i.
		for (j = 0; j < n; j++) {
			if (j == i)
				continue;

			temp = in_mat[j * n + i];
			for (k = 0; k < n; k++) {
				out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
				in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
			}
		}
	}
	return 0;
}
|
||||
|
||||
// Calculates const table gftbl in GF(2^8) from single input A
// gftbl(A) = {A{00}, A{01}, A{02}, ... , A{0f} }, {A{00}, A{10}, A{20}, ... , A{f0} }

// Expand a single GF(2^8) coefficient c into the 32-byte split table used
// by the vector kernels: tbl[0..15] holds c * {0x0..0xf} (low-nibble
// products) and tbl[16..31] holds c * {0x00,0x10,...,0xf0} (high-nibble
// products).  Note tbl[1] == c itself; the *_base functions read the raw
// coefficient back from that slot.
void gf_vect_mul_init(unsigned char c, unsigned char *tbl)
{
	// Successive GF doublings of c; 0x1d is the low byte of the field
	// polynomial used for reduction when the top bit shifts out.
	unsigned char c2 = (c << 1) ^ ((c & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	unsigned char c4 = (c2 << 1) ^ ((c2 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	unsigned char c8 = (c4 << 1) ^ ((c4 & 0x80) ? 0x1d : 0);	//Mult by GF{2}

#if __WORDSIZE == 64 || _WIN64 || __x86_64__
	unsigned long long v1, v2, v4, v8, *t;
	unsigned long long v10, v20, v40, v80;
	unsigned char c17, c18, c20, c24;

	// NOTE(review): stores to tbl 8 bytes at a time through a 64-bit
	// pointer; assumes the target tolerates the access alignment
	// (true on x86) — confirm if ported elsewhere.
	t = (unsigned long long *)tbl;

	// Each multiplier constant replicates cN into exactly the byte
	// lanes whose table index has the corresponding bit set, so
	// XOR-ing the products yields tbl[n] = c * n for n = 0..7.
	v1 = c * 0x0100010001000100ull;		// lanes 1,3,5,7 (bit 0)
	v2 = c2 * 0x0101000001010000ull;	// lanes 2,3,6,7 (bit 1)
	v4 = c4 * 0x0101010100000000ull;	// lanes 4,5,6,7 (bit 2)
	v8 = c8 * 0x0101010101010101ull;	// all lanes     (bit 3)

	v4 = v1 ^ v2 ^ v4;
	t[0] = v4;		// tbl[0..7]  = c * {0..7}
	t[1] = v8 ^ v4;		// tbl[8..15] = c * {8..15}

	// Continue doubling for the high-nibble half: c17 = c * 0x10, etc.
	// (names refer to the table slot each value lands in)
	c17 = (c8 << 1) ^ ((c8 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c18 = (c17 << 1) ^ ((c17 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c20 = (c18 << 1) ^ ((c18 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c24 = (c20 << 1) ^ ((c20 & 0x80) ? 0x1d : 0);	//Mult by GF{2}

	// Same lane-replication trick for tbl[16..31]
	v10 = c17 * 0x0100010001000100ull;
	v20 = c18 * 0x0101000001010000ull;
	v40 = c20 * 0x0101010100000000ull;
	v80 = c24 * 0x0101010101010101ull;

	v40 = v10 ^ v20 ^ v40;
	t[2] = v40;		// tbl[16..23] = c * {0x00..0x70}
	t[3] = v80 ^ v40;	// tbl[24..31] = c * {0x80..0xf0}

#else // 32-bit or other
	// Portable path: build every entry explicitly by XOR-combining the
	// doubled values (cN = c * N in GF(2^8), N in hex).
	unsigned char c3, c5, c6, c7, c9, c10, c11, c12, c13, c14, c15;
	unsigned char c17, c18, c19, c20, c21, c22, c23, c24, c25, c26, c27, c28, c29, c30,
	    c31;

	c3 = c2 ^ c;
	c5 = c4 ^ c;
	c6 = c4 ^ c2;
	c7 = c4 ^ c3;

	c9 = c8 ^ c;
	c10 = c8 ^ c2;
	c11 = c8 ^ c3;
	c12 = c8 ^ c4;
	c13 = c8 ^ c5;
	c14 = c8 ^ c6;
	c15 = c8 ^ c7;

	// Low-nibble half: tbl[n] = c * n
	tbl[0] = 0;
	tbl[1] = c;
	tbl[2] = c2;
	tbl[3] = c3;
	tbl[4] = c4;
	tbl[5] = c5;
	tbl[6] = c6;
	tbl[7] = c7;
	tbl[8] = c8;
	tbl[9] = c9;
	tbl[10] = c10;
	tbl[11] = c11;
	tbl[12] = c12;
	tbl[13] = c13;
	tbl[14] = c14;
	tbl[15] = c15;

	// High-nibble half: c17 = c * 0x10, then XOR-combine as above
	// (names refer to the table slot each value lands in)
	c17 = (c8 << 1) ^ ((c8 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c18 = (c17 << 1) ^ ((c17 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c19 = c18 ^ c17;
	c20 = (c18 << 1) ^ ((c18 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c21 = c20 ^ c17;
	c22 = c20 ^ c18;
	c23 = c20 ^ c19;
	c24 = (c20 << 1) ^ ((c20 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c25 = c24 ^ c17;
	c26 = c24 ^ c18;
	c27 = c24 ^ c19;
	c28 = c24 ^ c20;
	c29 = c24 ^ c21;
	c30 = c24 ^ c22;
	c31 = c24 ^ c23;

	tbl[16] = 0;
	tbl[17] = c17;
	tbl[18] = c18;
	tbl[19] = c19;
	tbl[20] = c20;
	tbl[21] = c21;
	tbl[22] = c22;
	tbl[23] = c23;
	tbl[24] = c24;
	tbl[25] = c25;
	tbl[26] = c26;
	tbl[27] = c27;
	tbl[28] = c28;
	tbl[29] = c29;
	tbl[30] = c30;
	tbl[31] = c31;

#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
}
|
||||
|
||||
/*
 * Reference GF(2^8) dot product:
 *   dest[i] = XOR over j of src[j][i] * coef[j]
 * The j-th coefficient is read from slot 1 of the j-th 32-byte expanded
 * table in v (slot 1 of each table holds the raw coefficient).
 */
void gf_vect_dot_prod_base(int len, int vlen, unsigned char *v,
			   unsigned char **src, unsigned char *dest)
{
	int pos, vec;
	unsigned char acc;

	for (pos = 0; pos < len; pos++) {
		acc = 0;
		for (vec = 0; vec < vlen; vec++)
			acc ^= gf_mul(src[vec][pos], v[32 * vec + 1]);

		dest[pos] = acc;
	}
}
|
||||
|
||||
/*
 * Reference GF(2^8) multiply-accumulate:
 *   dest[i] ^= src[i] * c
 * where c sits in slot 1 of table vec_i within v.  The 'vec' argument
 * (total vector count) is unused here but kept for a uniform signature.
 */
void gf_vect_mad_base(int len, int vec, int vec_i,
		      unsigned char *v, unsigned char *src, unsigned char *dest)
{
	int pos;

	for (pos = 0; pos < len; pos++)
		dest[pos] ^= gf_mul(src[pos], v[vec_i * 32 + 1]);
}
|
||||
|
||||
/*
 * Reference erasure-code encode: for each of 'dests' outputs compute a
 * GF(2^8) dot product across the 'srcs' inputs.  v holds dests * srcs
 * expanded 32-byte tables; the coefficient for (output out, input in)
 * is at v[in*32 + out*srcs*32 + 1].
 */
void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v,
			 unsigned char **src, unsigned char **dest)
{
	int out, pos, in;
	unsigned char acc;

	for (out = 0; out < dests; out++) {
		for (pos = 0; pos < len; pos++) {
			acc = 0;
			for (in = 0; in < srcs; in++)
				acc ^= gf_mul(src[in][pos], v[in * 32 + out * srcs * 32 + 1]);

			dest[out][pos] = acc;
		}
	}
}
|
||||
|
||||
/*
 * Reference incremental encode: fold a single source buffer 'data'
 * (source index vec_i of k) into every one of 'rows' partial parity
 * buffers.  The coefficient for (row l, source vec_i) sits at
 * v[vec_i*32 + l*k*32 + 1].
 */
void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
				unsigned char *data, unsigned char **dest)
{
	int row, pos;

	for (row = 0; row < rows; row++)
		for (pos = 0; pos < len; pos++)
			dest[row][pos] ^= gf_mul(data[pos], v[vec_i * 32 + row * k * 32 + 1]);
}
|
||||
|
||||
/*
 * Reference constant multiply: dest[i] = c * src[i] in GF(2^8).
 * a is a 32-byte expanded table; a[1] (the 2nd element) holds the raw
 * coefficient the table was filled from.
 */
void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest)
{
	//2nd element of table array is ref value used to fill it in
	unsigned char c = a[1];
	int i;

	for (i = 0; i < len; i++)
		dest[i] = gf_mul(c, src[i]);
}
|
||||
|
||||
// Per-function version record embedded in the compiled object.
// UINT16/UINT8 come from "types.h".
struct slver {
	UINT16 snum;	// function serial number
	UINT8 ver;	// version
	UINT8 core;	// implementation/core id
};

// Version info
// For each function: a declaration whose name encodes the version
// (scannable symbol), plus an initialized record with the same fields.
struct slver gf_vect_mul_init_slver_00020035;
struct slver gf_vect_mul_init_slver = { 0x0035, 0x02, 0x00 };

struct slver ec_encode_data_base_slver_00010135;
struct slver ec_encode_data_base_slver = { 0x0135, 0x01, 0x00 };

struct slver gf_vect_mul_base_slver_00010136;
struct slver gf_vect_mul_base_slver = { 0x0136, 0x01, 0x00 };

struct slver gf_vect_dot_prod_base_slver_00010137;
struct slver gf_vect_dot_prod_base_slver = { 0x0137, 0x01, 0x00 };

struct slver gf_mul_slver_00000214;
struct slver gf_mul_slver = { 0x0214, 0x00, 0x00 };

struct slver gf_invert_matrix_slver_00000215;
struct slver gf_invert_matrix_slver = { 0x0215, 0x00, 0x00};

struct slver gf_gen_rs_matrix_slver_00000216;
struct slver gf_gen_rs_matrix_slver = { 0x0216, 0x00, 0x00 };

struct slver gf_gen_cauchy1_matrix_slver_00000217;
struct slver gf_gen_cauchy1_matrix_slver = { 0x0217, 0x00, 0x00};
|
6680
erasure_code/ec_base.h
Normal file
6680
erasure_code/ec_base.h
Normal file
File diff suppressed because it is too large
Load Diff
267
erasure_code/ec_highlevel_func.c
Normal file
267
erasure_code/ec_highlevel_func.c
Normal file
@ -0,0 +1,267 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
#include <limits.h>
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
/*
 * Expand an encode matrix of rows * k GF(2^8) coefficients in a[] into
 * the concatenated 32-byte-per-coefficient tables in g_tbls, in the
 * layout the encode kernels expect (row-major, 32 bytes per entry).
 */
void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
{
	int idx;
	const int total = rows * k;

	for (idx = 0; idx < total; idx++)
		gf_vect_mul_init(a[idx], g_tbls + 32 * idx);
}
|
||||
|
||||
/*
 * Encode parity with the SSE4.1 dot-product kernels.
 * Processes four parity rows per pass while possible, then the
 * remaining 0-3 rows.  Buffers shorter than 16 bytes fall back to the
 * portable base implementation.
 */
void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
			unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
		return;
	}

	/* Four outputs at a time; advance past 4 rows of tables each pass */
	for (; rows >= 4; rows -= 4) {
		gf_4vect_dot_prod_sse(len, k, g_tbls, data, coding);
		g_tbls += 4 * k * 32;
		coding += 4;
	}

	/* Tail: 0..3 rows remain */
	if (rows == 3)
		gf_3vect_dot_prod_sse(len, k, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_dot_prod_sse(len, k, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_dot_prod_sse(len, k, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Encode parity with the AVX dot-product kernels.
 * Same structure as the SSE path: four rows per pass, then the 0-3 row
 * tail; buffers under 16 bytes use the portable base implementation.
 */
void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
			unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
		return;
	}

	/* Four outputs at a time; advance past 4 rows of tables each pass */
	for (; rows >= 4; rows -= 4) {
		gf_4vect_dot_prod_avx(len, k, g_tbls, data, coding);
		g_tbls += 4 * k * 32;
		coding += 4;
	}

	/* Tail: 0..3 rows remain */
	if (rows == 3)
		gf_3vect_dot_prod_avx(len, k, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_dot_prod_avx(len, k, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_dot_prod_avx(len, k, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Encode parity with the AVX2 dot-product kernels.
 * Same structure as the SSE/AVX paths, but the minimum vector length
 * is 32 bytes (256-bit registers); shorter buffers use the base path.
 */
void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
			 unsigned char **coding)
{
	if (len < 32) {
		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
		return;
	}

	/* Four outputs at a time; advance past 4 rows of tables each pass */
	for (; rows >= 4; rows -= 4) {
		gf_4vect_dot_prod_avx2(len, k, g_tbls, data, coding);
		g_tbls += 4 * k * 32;
		coding += 4;
	}

	/* Tail: 0..3 rows remain */
	if (rows == 3)
		gf_3vect_dot_prod_avx2(len, k, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_dot_prod_avx2(len, k, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_dot_prod_avx2(len, k, g_tbls, data, *coding);
}
|
||||
|
||||
#if __WORDSIZE == 64 || _WIN64 || __x86_64__
|
||||
|
||||
/*
 * Incrementally fold one source buffer into the parity rows using the
 * SSE4.1 multiply-accumulate kernels: six rows per pass, then the
 * remaining 1-6 rows.  Buffers under 16 bytes use the base path.
 */
void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
			       unsigned char *data, unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
		return;
	}

	/* Six outputs at a time while more than six remain */
	for (; rows > 6; rows -= 6) {
		gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
		g_tbls += 6 * k * 32;
		coding += 6;
	}

	/* Tail: 0..6 rows remain */
	if (rows == 6)
		gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 5)
		gf_5vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 4)
		gf_4vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 3)
		gf_3vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_mad_sse(len, k, vec_i, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Incrementally fold one source buffer into the parity rows using the
 * AVX multiply-accumulate kernels.  Same structure as the SSE path;
 * buffers under 16 bytes use the base implementation.
 */
void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
			       unsigned char *data, unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
		return;
	}

	/* Six outputs at a time while more than six remain */
	for (; rows > 6; rows -= 6) {
		gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
		g_tbls += 6 * k * 32;
		coding += 6;
	}

	/* Tail: 0..6 rows remain */
	if (rows == 6)
		gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 5)
		gf_5vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 4)
		gf_4vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 3)
		gf_3vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_mad_avx(len, k, vec_i, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Incrementally fold one source buffer into the parity rows using the
 * AVX2 multiply-accumulate kernels.  Minimum vector length is 32 bytes
 * (256-bit registers); shorter buffers use the base implementation.
 */
void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
				unsigned char *data, unsigned char **coding)
{
	if (len < 32) {
		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
		return;
	}

	/* Six outputs at a time while more than six remain */
	for (; rows > 6; rows -= 6) {
		gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
		g_tbls += 6 * k * 32;
		coding += 6;
	}

	/* Tail: 0..6 rows remain */
	if (rows == 6)
		gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 5)
		gf_5vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 4)
		gf_4vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 3)
		gf_3vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_mad_avx2(len, k, vec_i, g_tbls, data, *coding);
}
|
||||
|
||||
#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
|
||||
|
||||
// Per-function version record embedded in the compiled object
// (same layout as the record in ec_base.c; UINT types from "types.h").
struct slver {
	UINT16 snum;	// function serial number
	UINT8 ver;	// version
	UINT8 core;	// implementation/core id
};

// Version info
struct slver ec_init_tables_slver_00010068;
struct slver ec_init_tables_slver = { 0x0068, 0x01, 0x00 };

struct slver ec_encode_data_sse_slver_00020069;
struct slver ec_encode_data_sse_slver = { 0x0069, 0x02, 0x00 };
|
395
erasure_code/ec_multibinary.asm
Normal file
395
erasure_code/ec_multibinary.asm
Normal file
@ -0,0 +1,395 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; For ELF64 shared objects, route external references through the PLT;
;; other output formats need no relocation suffix.
%ifidn __OUTPUT_FORMAT__, elf64
%define WRT_OPT		wrt ..plt
%else
%define WRT_OPT
%endif
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
;; Select word size and scratch registers for the dispatchers below.
;; def_wrd/wrd_sz abstract the pointer width; arg1..arg5 are the caller
;; registers the dispatch code must preserve and use for scratch.
%ifidn __OUTPUT_FORMAT__, elf32

[bits 32]

%define def_wrd		dd
%define wrd_sz		dword
%define arg1		esi
%define arg2		eax
%define arg3		ebx
%define arg4		ecx
%define arg5		edx

%else

default rel
[bits 64]

%define def_wrd		dq
%define wrd_sz		qword
%define arg1		rsi
%define arg2		rax
%define arg3		rbx
%define arg4		rcx
%define arg5		rdx

;; SIMD implementations only referenced from the 64-bit build
extern ec_encode_data_update_sse
extern ec_encode_data_update_avx
extern ec_encode_data_update_avx2
extern gf_vect_mul_sse
extern gf_vect_mul_avx

extern gf_vect_mad_sse
extern gf_vect_mad_avx
extern gf_vect_mad_avx2
%endif
|
||||
|
||||
extern gf_vect_mul_base
|
||||
extern ec_encode_data_base
|
||||
extern ec_encode_data_update_base
|
||||
extern gf_vect_dot_prod_base
|
||||
extern gf_vect_mad_base
|
||||
|
||||
extern gf_vect_dot_prod_sse
|
||||
extern gf_vect_dot_prod_avx
|
||||
extern gf_vect_dot_prod_avx2
|
||||
extern ec_encode_data_sse
|
||||
extern ec_encode_data_avx
|
||||
extern ec_encode_data_avx2
|
||||
|
||||
|
||||
section .data
;;; *_mbinit are initial values for *_dispatched; is updated on first call.
;;; Therefore, *_dispatch_init is only executed on first call.

;; Each pointer initially targets the *_mbinit stub, which runs the CPU
;; feature probe, overwrites the pointer with the chosen implementation,
;; and falls through; every later call jumps straight to the selection.
ec_encode_data_dispatched:
	def_wrd		ec_encode_data_mbinit

gf_vect_mul_dispatched:
	def_wrd		gf_vect_mul_mbinit

gf_vect_dot_prod_dispatched:
	def_wrd		gf_vect_dot_prod_mbinit

ec_encode_data_update_dispatched:
	def_wrd		ec_encode_data_update_mbinit

gf_vect_mad_dispatched:
	def_wrd		gf_vect_mad_mbinit
|
||||
|
||||
section .text
;;;;
; ec_encode_data multibinary function
;;;;
global ec_encode_data:function
ec_encode_data_mbinit:
	;; First call only: pick an implementation, then fall through to
	;; the jump below, which now targets the chosen routine.
	call	ec_encode_data_dispatch_init

ec_encode_data:
	jmp	wrd_sz [ec_encode_data_dispatched]

ec_encode_data_dispatch_init:
	;; Save caller-visible registers (cpuid clobbers eax/ebx/ecx/edx)
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5
	lea	arg1, [ec_encode_data_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	arg3, [ec_encode_data_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, arg3

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	arg3, [ec_encode_data_avx WRT_OPT]

	jne	_done_ec_encode_data_init
	mov	arg1, arg3

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	arg3, [ec_encode_data_avx2 WRT_OPT]
	cmovne	arg1, arg3
	;; Does it have xmm and ymm support (OS enabled XMM/YMM state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_ec_encode_data_init
	;; OS does not save YMM state: fall back to SSE
	lea	arg1, [ec_encode_data_sse WRT_OPT]

_done_ec_encode_data_init:
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	mov	[ec_encode_data_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; gf_vect_mul multibinary function
;;;;
global gf_vect_mul:function
gf_vect_mul_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	gf_vect_mul_dispatch_init

gf_vect_mul:
	jmp	wrd_sz [gf_vect_mul_dispatched]

gf_vect_mul_dispatch_init:
	push	arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	;; 32-bit build exposes only the portable C version
	lea	arg1, [gf_vect_mul_base]
%else
	push	rax
	push	rbx
	push	rcx
	push	rdx
	lea	arg1, [gf_vect_mul_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; NOTE(review): gates the SSE path on the SSE4.2 cpuid bit (the
	;; other dispatchers test SSE4.1) — confirm this is intentional.
	test	ecx, FLAG_CPUID1_ECX_SSE4_2
	lea	rbx, [gf_vect_mul_sse WRT_OPT]
	je	_done_gf_vect_mul_dispatch_init
	mov	arg1, rbx

	;; Try for AVX
	and	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	cmp	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	jne	_done_gf_vect_mul_dispatch_init

	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	jne	_done_gf_vect_mul_dispatch_init
	lea	arg1, [gf_vect_mul_avx WRT_OPT]

_done_gf_vect_mul_dispatch_init:
	pop	rdx
	pop	rcx
	pop	rbx
	pop	rax
%endif		;; END 32-bit check
	mov	[gf_vect_mul_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; ec_encode_data_update multibinary function
;;;;
global ec_encode_data_update:function
ec_encode_data_update_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	ec_encode_data_update_dispatch_init

ec_encode_data_update:
	jmp	wrd_sz [ec_encode_data_update_dispatched]

ec_encode_data_update_dispatch_init:
	push	arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	;; 32-bit build exposes only the portable C version
	lea	arg1, [ec_encode_data_update_base]
%else
	push	rax
	push	rbx
	push	rcx
	push	rdx
	lea	arg1, [ec_encode_data_update_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	rbx, [ec_encode_data_update_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, rbx

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	rbx, [ec_encode_data_update_avx WRT_OPT]

	jne	_done_ec_encode_data_update_init
	;; rsi is arg1 under the 64-bit register mapping above
	mov	rsi, rbx

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	rbx, [ec_encode_data_update_avx2 WRT_OPT]
	cmovne	rsi, rbx

	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_ec_encode_data_update_init
	;; OS does not save YMM state: fall back to SSE
	lea	rsi, [ec_encode_data_update_sse WRT_OPT]

_done_ec_encode_data_update_init:
	pop	rdx
	pop	rcx
	pop	rbx
	pop	rax
%endif		;; END 32-bit check
	mov	[ec_encode_data_update_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; gf_vect_dot_prod multibinary function
;;;;
global gf_vect_dot_prod:function
gf_vect_dot_prod_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	gf_vect_dot_prod_dispatch_init

gf_vect_dot_prod:
	jmp	wrd_sz [gf_vect_dot_prod_dispatched]

gf_vect_dot_prod_dispatch_init:
	;; Save caller-visible registers (cpuid clobbers eax/ebx/ecx/edx)
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5
	lea	arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	arg3, [gf_vect_dot_prod_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, arg3

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	arg3, [gf_vect_dot_prod_avx WRT_OPT]

	jne	_done_gf_vect_dot_prod_init
	mov	arg1, arg3

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	arg3, [gf_vect_dot_prod_avx2 WRT_OPT]
	cmovne	arg1, arg3
	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_gf_vect_dot_prod_init
	;; OS does not save YMM state: fall back to SSE
	lea	arg1, [gf_vect_dot_prod_sse WRT_OPT]

_done_gf_vect_dot_prod_init:
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	mov	[gf_vect_dot_prod_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; gf_vect_mad multibinary function
;;;;
global gf_vect_mad:function
gf_vect_mad_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	gf_vect_mad_dispatch_init

gf_vect_mad:
	jmp	wrd_sz [gf_vect_mad_dispatched]

gf_vect_mad_dispatch_init:
	push	arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	;; 32-bit build exposes only the portable C version
	lea	arg1, [gf_vect_mad_base]
%else
	push	rax
	push	rbx
	push	rcx
	push	rdx
	lea	arg1, [gf_vect_mad_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	rbx, [gf_vect_mad_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, rbx

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	rbx, [gf_vect_mad_avx WRT_OPT]

	jne	_done_gf_vect_mad_init
	;; rsi is arg1 under the 64-bit register mapping above
	mov	rsi, rbx

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	rbx, [gf_vect_mad_avx2 WRT_OPT]
	cmovne	rsi, rbx

	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_gf_vect_mad_init
	;; OS does not save YMM state: fall back to SSE
	lea	rsi, [gf_vect_mad_sse WRT_OPT]

_done_gf_vect_mad_init:
	pop	rdx
	pop	rcx
	pop	rbx
	pop	rax
%endif		;; END 32-bit check
	mov	[gf_vect_mad_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;; Emit per-function version markers into the object file
;; (slversion macro presumably comes from reg_sizes.asm — confirm).
;;; func			core, ver, snum
slversion ec_encode_data,	00,   04,  0133
slversion gf_vect_mul,		00,   03,  0134
slversion ec_encode_data_update, 00,  03,  0212
slversion gf_vect_dot_prod,	00,   03,  0138
slversion gf_vect_mad,		00,   02,  0213
|
168
erasure_code/erasure_code_base_perf.c
Normal file
168
erasure_code/erasure_code_base_perf.c
Normal file
@ -0,0 +1,168 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
// Benchmark configuration: choose between a cache-hot run (many loops
// over a small buffer), a cache-cold run (few loops over a buffer larger
// than L3), or fully custom parameters via TEST_CUSTOM.
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m)  ((128*1024 / m) & ~(64-1))	// per-buffer size, 64B aligned
# define TEST_LOOPS(m)  (100*m)
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
#  define TEST_SOURCES 32
#  define GT_L3_CACHE  32*1024*1024	/* some number > last level cache */
#  define TEST_LEN(m)  ((GT_L3_CACHE / m) & ~(64-1))	// per-buffer size, 64B aligned
#  define TEST_LOOPS(m)  (10)
#  define TEST_TYPE_STR "_cold"
# else
#  define TEST_TYPE_STR "_cus"
#  ifndef TEST_LOOPS
#   define TEST_LOOPS(m)  1000
#  endif
# endif
#endif

// Matrix dimension caps: total buffers (m) and data buffers (k)
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES

typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
{
	// Performance harness for the base (non-SIMD) erasure-code path:
	// times repeated encode and decode passes over an m x TEST_LEN(m)
	// buffer set and prints throughput via perf_print().
	int i, j, rtest, m, k, nerrs, r;
	void *buf;
	u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	// a = encode matrix, b = surviving-rows submatrix, d = inverse of b,
	// c = decode rows extracted from d for the erased sources.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
	u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
	struct perf start, stop;

	// Pick test parameters: m total buffers, k data buffers, nerrs fixed
	// erasures chosen from err_list (all < k, i.e. data erasures).
	m = 14;
	k = 10;
	nerrs = 4;
	const u8 err_list[] = { 2, 4, 5, 7 };

	printf("erasure_code_base_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);

	if (m > MMAX || k > KMAX || nerrs > (m - k)) {
		printf(" Input test parameter error\n");
		return -1;
	}

	// Mark the erased sources both as a list and as a per-index flag map.
	memcpy(src_err_list, err_list, nerrs);
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0; i < nerrs; i++)
		src_in_err[src_err_list[i]] = 1;

	// Allocate the arrays (64-byte aligned; never freed — process exits
	// right after the test, so cleanup is deliberately skipped).
	for (i = 0; i < m; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		buffs[i] = buf;
	}

	// m - k scratch buffers receive the recovered data during decode.
	for (i = 0; i < (m - k); i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN(m); j++)
			buffs[i][j] = rand();

	// One warm-up encode before timing starts.
	gf_gen_rs_matrix(a, m, k);
	ec_init_tables(k, m - k, &a[k * k], g_tbls);
	ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);

	// Start encode test: table init + encode are both inside the timed
	// loop, matching what a caller would pay per encode.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	}
	perf_stop(&stop);
	printf("erasure_code_base_encode" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);

	// Start decode test: each iteration rebuilds the decode matrix from
	// scratch (submatrix selection, inversion, table init) plus recovery.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r])
				r++;
			recov[i] = buffs[r];
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix(b, d, k) < 0) {
			printf("BAD MATRIX\n");
			return -1;
		}

		// Rows of the inverse corresponding to erased (data) sources
		// form the decode matrix. Valid because all erasures are < k.
		for (i = 0; i < nerrs; i++)
			for (j = 0; j < k; j++)
				c[k * i + j] = d[k * src_err_list[i] + j];

		// Recover data
		ec_init_tables(k, nerrs, c, g_tbls);
		ec_encode_data_base(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
	}
	perf_stop(&stop);

	// Verify the last decode actually reproduced the erased buffers
	// before trusting/printing the timing numbers.
	for (i = 0; i < nerrs; i++) {
		if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
			printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
			return -1;
		}
	}

	printf("erasure_code_base_decode" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);

	printf("done all: Pass\n");
	return 0;
}
|
764
erasure_code/erasure_code_base_test.c
Normal file
764
erasure_code/erasure_code_base_test.c
Normal file
@ -0,0 +1,764 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 50
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#define EFENCE_TEST_MIN_SIZE 16
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf as two-digit hex values, 32 per output row,
// finishing with a newline.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored as k row pointers, one row per line, each
// element as a two-digit hex value.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored as one flat row-major byte array, one row
// per line, each element as a two-digit hex value.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[col + (row * m)] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
|
||||
// Generate decode matrix from encode matrix
|
||||
// Generate decode matrix from encode matrix.
// Builds the k x k submatrix b of encode_matrix from surviving rows,
// inverts it, and derives decode_matrix rows for every erased source in
// src_err_list (nsrcerrs data erasures first, then parity erasures).
// decode_index[] records which surviving row feeds each of the k inputs.
// Returns 0 on success, -1 on allocation failure, NO_INVERT_MATRIX if no
// invertible submatrix can be found.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	// b is the candidate submatrix handed to gf_invert_matrix (which may
	// clobber it); backup keeps a pristine copy for retry attempts.
	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	// Retry loop: if b is singular (possible with gf_gen_rs_matrix),
	// replace the last chosen row with the next surviving row further
	// down the encode matrix and try the inversion again.
	incr = 0;
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		if (nerrs == (m - k)) {
			// All redundancy consumed — no spare row to swap in.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// Skip candidate rows that are themselves erased parity rows.
		// NOTE(review): the bound `i < nerrs - nsrcerrs` looks odd —
		// parity erasures occupy src_err_list[nsrcerrs..nerrs-1], so
		// `i < nerrs` would seem intended; and the loop only bumps
		// incr once per matching entry without re-checking earlier
		// entries. Left as-is; confirm against upstream isa-l.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			// Ran past the last row of the encode matrix.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased data sources: decode rows come straight from the inverse.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			// GF(2^8) dot product: (erased parity row of encode
			// matrix) x (column i of the inverse).
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
{
	// Functional test of the base (non-SIMD) erasure-code path. Five
	// phases: fixed RS-matrix test, fixed Cauchy-matrix test, random
	// geometry tests, end-of-buffer (Electric Fence) tests, unaligned
	// pointer tests, and size-alignment sweep. Seeded PRNG makes the
	// whole run reproducible; the rand() call order must not change.
	int re = 0;
	int i, j, p, rtest, m, k;
	int nerrs, nsrcerrs;
	void *buf;
	unsigned int decode_index[MMAX];
	unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
	unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
	unsigned char *recov[TEST_SOURCES];

	int rows, align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *temp_ubuffs[TEST_SOURCES];

	printf("erasure_code_base_test: %dx%d ", TEST_SOURCES, TEST_LEN);
	srand(TEST_SEED);

	// Allocate the arrays (64-byte aligned; freed implicitly at exit)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Test erasure code by encode and recovery

	encode_matrix = malloc(MMAX * KMAX);
	decode_matrix = malloc(MMAX * KMAX);
	invert_matrix = malloc(MMAX * KMAX);
	g_tbls = malloc(KMAX * TEST_SOURCES * 32);
	if (encode_matrix == NULL || decode_matrix == NULL
	    || invert_matrix == NULL || g_tbls == NULL) {
		printf("Test failure! Error with malloc\n");
		return -1;
	}
	// ---- Phase 1: fixed-size test with a Reed-Solomon matrix ----
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Generate encode matrix encode_matrix
	// The matrix generated by gf_gen_rs_matrix
	// is not always invertable.
	gf_gen_rs_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data, then compare each recovered buffer against its
	// original; dump full diagnostics on the first mismatch.
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// ---- Phase 2: same fixed-size test with a Cauchy matrix ----
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// The matrix generated by gf_gen_cauchy1_matrix
	// is always invertable.
	gf_gen_cauchy1_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// ---- Phase 3: random (m, k) geometries with random erasures ----
	// Do more random tests
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		// Draw m in [2, MMAX) and k in [1, m).
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((u8 *) encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((u8 *) decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
		putchar('.');	// progress marker per passing iteration
	}

	// ---- Phase 4: end-of-buffer tests (catch overruns under efence) ----
	// Run tests at end of buffer for Electric Fence
	k = 16;
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	if (k > KMAX)
		return -1;

	for (rows = 1; rows <= 16; rows++) {
		m = k + rows;
		if (m > MMAX)
			return -1;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
			for (i = 0; i < m; i++) {	// Line up TEST_SIZE from end
				efence_buffs[i] = buffs[i] + TEST_LEN - size;
			}

			// The matrix generated by gf_gen_cauchy1_matrix
			// is always invertable.
			gf_gen_cauchy1_matrix(encode_matrix, m, k);

			// Make parity vects
			// Generate g_tbls from encode matrix a
			ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
			// Perform matrix dot_prod for EC encoding
			// using g_tbls from encode matrix a
			ec_encode_data_base(size, k, m - k, g_tbls, efence_buffs,
					    &efence_buffs[k]);

			// Random errors
			memset(src_in_err, 0, TEST_SOURCES);
			gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

			// Generate decode matrix
			re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
						  invert_matrix, decode_index, src_err_list,
						  src_in_err, nerrs, nsrcerrs, k, m);
			if (re != 0) {
				printf("Fail to gf_gen_decode_matrix\n");
				return -1;
			}
			// Pack recovery array as list of valid sources
			// Its order must be the same as the order
			// to generate matrix b in gf_gen_decode_matrix
			for (i = 0; i < k; i++) {
				recov[i] = efence_buffs[decode_index[i]];
			}

			// Recover data
			ec_init_tables(k, nerrs, decode_matrix, g_tbls);
			ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

			for (i = 0; i < nerrs; i++) {

				if (0 !=
				    memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
					   size)) {
					printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
					       k, nerrs);

					printf("size = %d\n", size);

					printf("Test erase list = ");
					for (j = 0; j < nerrs; j++)
						printf(" %d", src_err_list[j]);
					printf(" - Index = ");
					for (p = 0; p < k; p++)
						printf(" %d", decode_index[p]);
					printf("\nencode_matrix:\n");
					dump_u8xu8((u8 *) encode_matrix, m, k);
					printf("inv b:\n");
					dump_u8xu8((u8 *) invert_matrix, k, k);
					printf("\ndecode_matrix:\n");
					dump_u8xu8((u8 *) decode_matrix, m, k);

					printf("recov %d:", src_err_list[i]);
					dump(temp_buffs[k + i], align);
					printf("orig :");
					dump(efence_buffs[src_err_list[i]], align);
					return -1;
				}
			}
		}

	}

	// ---- Phase 5: random pointer-alignment tests with pad checking ----
	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < m; i++) {
			memset(buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			memset(temp_buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
			temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
		}

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data_base(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = ubuffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(ubuffs, m, 25);
				printf("orig :");
				dump(ubuffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_ubuffs[k + i], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		for (i = 0; i < m; i++) {

			offset = ubuffs[i] - buffs[i];

			if (memcmp(buffs[i], temp_buffs[0], offset)) {
				printf("Fail rand ualign encode pad start\n");
				return -1;
			}
			if (memcmp
			    (buffs[i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign encode pad end\n");
				return -1;
			}
		}

		for (i = 0; i < nerrs; i++) {

			offset = temp_ubuffs[k + i] - temp_buffs[k + i];
			if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
				printf("Fail rand ualign decode pad start\n");
				return -1;
			}
			if (memcmp
			    (temp_buffs[k + i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign decode pad end\n");
				return -1;
			}
		}

		putchar('.');
	}

	// ---- Phase 6: sweep over lengths to exercise size alignment ----
	// Test size alignment

	align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;

	for (size = TEST_LEN; size > 0; size -= align) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data_base(size, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
	}

	printf("done EC tests: Pass\n");
	return 0;
}
|
168
erasure_code/erasure_code_perf.c
Normal file
168
erasure_code/erasure_code_perf.c
Normal file
@ -0,0 +1,168 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 32
|
||||
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (10000*m)
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 32
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (50*m)
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS(m) 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Benchmark driver: measures Reed-Solomon encode and decode throughput for
// one fixed (m, k, nerrs) configuration using the generic dispatch entry
// points ec_init_tables()/ec_encode_data().  perf_start/perf_stop/perf_print
// come from test.h (timing helpers — exact units printed by perf_print).
int main(int argc, char *argv[])
{
	int i, j, rtest, m, k, nerrs, r;
	void *buf;
	u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	// a: encode matrix (m x k); b: a with erased rows removed (k x k);
	// d: inverse of b; c: rows of d selected for the erased sources.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
	u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
	struct perf start, stop;

	// Pick test parameters
	// NOTE: all entries of err_list are < k, i.e. only data (source)
	// fragments are erased.  The decode path below relies on this: it
	// indexes rows of the inverse d directly by src_err_list[].
	m = 14;
	k = 10;
	nerrs = 4;
	const u8 err_list[] = { 2, 4, 5, 7 };

	printf("erasure_code_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);

	// Sanity-check parameters against compile-time array bounds.
	if (m > MMAX || k > KMAX || nerrs > (m - k)) {
		printf(" Input test parameter error\n");
		return -1;
	}

	memcpy(src_err_list, err_list, nerrs);
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0; i < nerrs; i++)
		src_in_err[src_err_list[i]] = 1;

	// Allocate the arrays: m fragment buffers, 64-byte aligned.
	for (i = 0; i < m; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		buffs[i] = buf;
	}

	// m - k scratch buffers to receive recovered fragments
	// (sufficient since nerrs <= m - k was checked above).
	for (i = 0; i < (m - k); i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Make random data in the k source fragments.
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN(m); j++)
			buffs[i][j] = rand();

	// Warm-up encode: generate parity once before timing starts.
	gf_gen_rs_matrix(a, m, k);
	ec_init_tables(k, m - k, &a[k * k], g_tbls);
	ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);

	// Start encode test
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	}
	perf_stop(&stop);
	printf("erasure_code_encode" TEST_TYPE_STR ": ");
	// Bytes processed per iteration = fragment length * m total fragments.
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);

	// Start decode test
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Construct b by removing error rows; recov[] collects the k
		// surviving fragments in the same row order as b.
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r])
				r++;
			recov[i] = buffs[r];
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		// NOTE(review): gf_gen_rs_matrix submatrices are not guaranteed
		// invertible for all (m, k); this aborts rather than retrying.
		if (gf_invert_matrix(b, d, k) < 0) {
			printf("BAD MATRIX\n");
			return -1;
		}

		// Select the rows of the inverse corresponding to the erased
		// source fragments (valid because every erasure index < k).
		for (i = 0; i < nerrs; i++)
			for (j = 0; j < k; j++)
				c[k * i + j] = d[k * src_err_list[i] + j];

		// Recover data
		ec_init_tables(k, nerrs, c, g_tbls);
		ec_encode_data(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
	}
	perf_stop(&stop);

	// Verify the last recovery against the original source fragments.
	for (i = 0; i < nerrs; i++) {
		if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
			printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
			return -1;
		}
	}

	printf("erasure_code_decode" TEST_TYPE_STR ": ");
	// Decode touches k survivors read + nerrs outputs written per loop.
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);

	printf("done all: Pass\n");
	return 0;
}
|
168
erasure_code/erasure_code_sse_perf.c
Normal file
168
erasure_code/erasure_code_sse_perf.c
Normal file
@ -0,0 +1,168 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 32
|
||||
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (10000*m)
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 32
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (50*m)
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS(m) 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, m, k, nerrs, r;
|
||||
void *buf;
|
||||
u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
|
||||
u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
|
||||
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
|
||||
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
// Pick test parameters
|
||||
m = 14;
|
||||
k = 10;
|
||||
nerrs = 4;
|
||||
const u8 err_list[] = { 2, 4, 5, 7 };
|
||||
|
||||
printf("erasure_code_sse_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
|
||||
|
||||
if (m > MMAX || k > KMAX || nerrs > (m - k)) {
|
||||
printf(" Input test parameter error\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
memcpy(src_err_list, err_list, nerrs);
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
for (i = 0; i < nerrs; i++)
|
||||
src_in_err[src_err_list[i]] = 1;
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < m; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
|
||||
printf("alloc error: Fail\n");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
for (i = 0; i < (m - k); i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
|
||||
printf("alloc error: Fail\n");
|
||||
return -1;
|
||||
}
|
||||
temp_buffs[i] = buf;
|
||||
}
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN(m); j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
gf_gen_rs_matrix(a, m, k);
|
||||
ec_init_tables(k, m - k, &a[k * k], g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Start encode test
|
||||
perf_start(&start);
|
||||
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
|
||||
// Make parity vects
|
||||
ec_init_tables(k, m - k, &a[k * k], g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("erasure_code_sse_encode" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
|
||||
|
||||
// Start decode test
|
||||
perf_start(&start);
|
||||
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
|
||||
// Construct b by removing error rows
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r])
|
||||
r++;
|
||||
recov[i] = buffs[r];
|
||||
for (j = 0; j < k; j++)
|
||||
b[k * i + j] = a[k * r + j];
|
||||
}
|
||||
|
||||
if (gf_invert_matrix(b, d, k) < 0) {
|
||||
printf("BAD MATRIX\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++)
|
||||
for (j = 0; j < k; j++)
|
||||
c[k * i + j] = d[k * src_err_list[i] + j];
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, c, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("erasure_code_sse_decode" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);
|
||||
|
||||
printf("done all: Pass\n");
|
||||
return 0;
|
||||
}
|
764
erasure_code/erasure_code_sse_test.c
Normal file
764
erasure_code/erasure_code_sse_test.c
Normal file
@ -0,0 +1,764 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#define EFENCE_TEST_MIN_SIZE 16
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf as two-digit hex, 32 bytes per output row,
// followed by a terminating newline.
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", 0xff & buf[pos]);
		pos++;
		if ((pos % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of k row pointers as hex, one row per
// line, with a trailing blank line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat k-by-m byte matrix (row-major, row stride m) as hex,
// one matrix row per output line, with a trailing blank line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
|
||||
// Generate decode matrix from encode matrix
|
||||
// Build the decode matrix for the given erasure pattern.
//
// Inputs:  encode_matrix (m x k), src_err_list / src_in_err describing the
//          nerrs erasures (nsrcerrs of which are data rows, i.e. index < k).
// Outputs: decode_matrix (rows used to regenerate each erased fragment),
//          invert_matrix (inverse of the surviving k x k submatrix),
//          decode_index (which m rows / fragments were used as survivors).
// Returns: 0 on success, -1 on malloc failure,
//          NO_INVERT_MATRIX if no invertible survivor set can be found.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	// b is the working k x k survivor submatrix; backup preserves the
	// original survivor rows so failed inversion attempts can restart.
	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	incr = 0;
	// If b is singular, retry by replacing the LAST survivor row with
	// successive later rows of the encode matrix until inversion succeeds
	// or we run off the end of the matrix.
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		// All parity already erased: no alternative rows exist.
		if (nerrs == (m - k)) {
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// Skip candidate rows that are themselves erased parity rows.
		// NOTE(review): the bound `i < nerrs - nsrcerrs` looks odd —
		// erased-parity entries occupy src_err_list[nsrcerrs..nerrs-1],
		// so `i < nerrs` seems intended; also the inner incr++ only
		// helps if the matching entry is visited.  Verify upstream.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased DATA rows: the decode row is simply the corresponding row
	// of the inverse matrix.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	// Erased PARITY rows: decode row = (erased encode row) x invert_matrix,
	// accumulated with GF(2^8) multiply-xor.
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int re = 0;
|
||||
int i, j, p, rtest, m, k;
|
||||
int nerrs, nsrcerrs;
|
||||
void *buf;
|
||||
unsigned int decode_index[MMAX];
|
||||
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
|
||||
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
|
||||
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
|
||||
unsigned char *recov[TEST_SOURCES];
|
||||
|
||||
int rows, align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *temp_ubuffs[TEST_SOURCES];
|
||||
|
||||
printf("erasure_code_sse_test: %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
srand(TEST_SEED);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buffs[i] = buf;
|
||||
}
|
||||
|
||||
// Test erasure code by encode and recovery
|
||||
|
||||
encode_matrix = malloc(MMAX * KMAX);
|
||||
decode_matrix = malloc(MMAX * KMAX);
|
||||
invert_matrix = malloc(MMAX * KMAX);
|
||||
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
|
||||
if (encode_matrix == NULL || decode_matrix == NULL
|
||||
|| invert_matrix == NULL || g_tbls == NULL) {
|
||||
printf("Test failure! Error with malloc\n");
|
||||
return -1;
|
||||
}
|
||||
// Pick a first test
|
||||
m = 9;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// Generate encode matrix encode_matrix
|
||||
// The matrix generated by gf_gen_rs_matrix
|
||||
// is not always invertable.
|
||||
gf_gen_rs_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Pick a first test
|
||||
m = 9;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Do more random tests
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
k = 16;
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
if (k > KMAX)
|
||||
return -1;
|
||||
|
||||
for (rows = 1; rows <= 16; rows++) {
|
||||
m = k + rows;
|
||||
if (m > MMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(size, k, m - k, g_tbls, efence_buffs,
|
||||
&efence_buffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = efence_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
|
||||
size)) {
|
||||
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
|
||||
k, nerrs);
|
||||
|
||||
printf("size = %d\n", size);
|
||||
|
||||
printf("Test erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], align);
|
||||
printf("orig :");
|
||||
dump(efence_buffs[src_err_list[i]], align);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < m; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
}
|
||||
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = ubuffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(ubuffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(ubuffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_ubuffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
offset = ubuffs[i] - buffs[i];
|
||||
|
||||
if (memcmp(buffs[i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign encode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(buffs[i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign encode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
|
||||
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign decode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(temp_buffs[k + i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign decode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test size alignment
|
||||
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;
|
||||
|
||||
for (size = TEST_LEN; size > 0; size -= align) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(size, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("done EC tests: Pass\n");
|
||||
return 0;
|
||||
}
|
763
erasure_code/erasure_code_test.c
Normal file
763
erasure_code/erasure_code_test.c
Normal file
@ -0,0 +1,763 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#define EFENCE_TEST_MIN_SIZE 16
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print 'len' bytes of 'buf' to stdout as two-digit hex values,
// wrapping to a new line every 32 bytes.
void dump(unsigned char *buf, int len)
{
	int idx = 0;
	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k-row by m-column matrix stored as an array of k row pointers,
// one hex-formatted row per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;
	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored contiguously in row-major order,
// one hex-formatted row per output line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;
	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[row * m + col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
// Generate decode matrix from encode matrix.
// Builds a k x k matrix b from the rows of encode_matrix that survive the
// erasures in src_in_err, inverts it, and fills decode_matrix with one row
// per erased buffer (direct inverse rows for erased sources, a GF dot
// product for erased parity). decode_index records which surviving rows
// were used, in order.
// Returns 0 on success, -1 on malloc failure, NO_INVERT_MATRIX if no
// invertible submatrix can be found.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	incr = 0;
	// If b is singular, keep swapping the last row for the next unused
	// parity row of encode_matrix until inversion succeeds or rows run out.
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		if (nerrs == (m - k)) {
			// No spare parity rows left to substitute.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// NOTE(review): the bound 'nerrs - nsrcerrs' mixes two counts;
		// looks like it was meant to scan all parity erasures
		// (indices nsrcerrs..nerrs-1) — confirm against upstream.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			// Candidate row index ran past the matrix.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased source buffers decode directly from rows of the inverse.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
// Erasure-code regression driver. Runs five phases, any failure returns -1:
//   1. Fixed 9x5 test with an RS (Vandermonde-style) encode matrix.
//   2. Fixed 9x5 test with a Cauchy encode matrix.
//   3. RANDOMS randomized (m, k) encode/decode round trips.
//   4. "Electric Fence" tests reading/writing at the very end of buffers.
//   5. Pointer- and size-alignment sweeps.
// All randomness is seeded with TEST_SEED, so a run is reproducible.
int main(int argc, char *argv[])
{
	int re = 0;
	int i, j, p, rtest, m, k;
	int nerrs, nsrcerrs;
	void *buf;
	unsigned int decode_index[MMAX];
	unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
	unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
	unsigned char *recov[TEST_SOURCES];

	int rows, align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *temp_ubuffs[TEST_SOURCES];

	printf("erasure_code_test: %dx%d ", TEST_SOURCES, TEST_LEN);
	srand(TEST_SEED);

	// Allocate the arrays (64-byte aligned so the aligned code paths run)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Test erasure code by encode and recovery

	encode_matrix = malloc(MMAX * KMAX);
	decode_matrix = malloc(MMAX * KMAX);
	invert_matrix = malloc(MMAX * KMAX);
	g_tbls = malloc(KMAX * TEST_SOURCES * 32);
	if (encode_matrix == NULL || decode_matrix == NULL
	    || invert_matrix == NULL || g_tbls == NULL) {
		printf("Test failure! Error with malloc\n");
		return -1;
	}
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Generate encode matrix encode_matrix
	// The matrix generated by gf_gen_rs_matrix
	// is not always invertable.
	gf_gen_rs_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		// Each recovered buffer must match the original it replaces.
		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// The matrix generated by gf_gen_cauchy1_matrix
	// is always invertable.
	gf_gen_cauchy1_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		// Draw random m >= 2 and 1 <= k < m.
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((u8 *) encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((u8 *) decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	k = 16;
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	if (k > KMAX)
		return -1;

	for (rows = 1; rows <= 16; rows++) {
		m = k + rows;
		if (m > MMAX)
			return -1;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
			for (i = 0; i < m; i++) {	// Line up TEST_SIZE from end
				efence_buffs[i] = buffs[i] + TEST_LEN - size;
			}

			// The matrix generated by gf_gen_cauchy1_matrix
			// is always invertable.
			gf_gen_cauchy1_matrix(encode_matrix, m, k);

			// Make parity vects
			// Generate g_tbls from encode matrix a
			ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
			// Perform matrix dot_prod for EC encoding
			// using g_tbls from encode matrix a
			ec_encode_data(size, k, m - k, g_tbls, efence_buffs, &efence_buffs[k]);

			// Random errors
			memset(src_in_err, 0, TEST_SOURCES);
			gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

			// Generate decode matrix
			re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
						  invert_matrix, decode_index, src_err_list,
						  src_in_err, nerrs, nsrcerrs, k, m);
			if (re != 0) {
				printf("Fail to gf_gen_decode_matrix\n");
				return -1;
			}
			// Pack recovery array as list of valid sources
			// Its order must be the same as the order
			// to generate matrix b in gf_gen_decode_matrix
			for (i = 0; i < k; i++) {
				recov[i] = efence_buffs[decode_index[i]];
			}

			// Recover data
			ec_init_tables(k, nerrs, decode_matrix, g_tbls);
			ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

			for (i = 0; i < nerrs; i++) {

				if (0 !=
				    memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
					   size)) {
					printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
					       k, nerrs);

					printf("size = %d\n", size);

					printf("Test erase list = ");
					for (j = 0; j < nerrs; j++)
						printf(" %d", src_err_list[j]);
					printf(" - Index = ");
					for (p = 0; p < k; p++)
						printf(" %d", decode_index[p]);
					printf("\nencode_matrix:\n");
					dump_u8xu8((u8 *) encode_matrix, m, k);
					printf("inv b:\n");
					dump_u8xu8((u8 *) invert_matrix, k, k);
					printf("\ndecode_matrix:\n");
					dump_u8xu8((u8 *) decode_matrix, m, k);

					printf("recov %d:", src_err_list[i]);
					dump(temp_buffs[k + i], align);
					printf("orig :");
					dump(efence_buffs[src_err_list[i]], align);
					return -1;
				}
			}
		}

	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < m; i++) {
			memset(buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			memset(temp_buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
			temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
		}

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = ubuffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(ubuffs, m, 25);
				printf("orig :");
				dump(ubuffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_ubuffs[k + i], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		for (i = 0; i < m; i++) {

			offset = ubuffs[i] - buffs[i];

			if (memcmp(buffs[i], temp_buffs[0], offset)) {
				printf("Fail rand ualign encode pad start\n");
				return -1;
			}
			if (memcmp
			    (buffs[i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign encode pad end\n");
				return -1;
			}
		}

		for (i = 0; i < nerrs; i++) {

			offset = temp_ubuffs[k + i] - temp_buffs[k + i];
			if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
				printf("Fail rand ualign decode pad start\n");
				return -1;
			}
			if (memcmp
			    (temp_buffs[k + i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign decode pad end\n");
				return -1;
			}
		}

		putchar('.');
	}

	// Test size alignment

	align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;

	for (size = TEST_LEN; size > 0; size -= align) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data(size, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
	}

	printf("done EC tests: Pass\n");
	return 0;
}
|
306
erasure_code/erasure_code_update_perf.c
Normal file
306
erasure_code/erasure_code_update_perf.c
Normal file
@ -0,0 +1,306 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
#include "test.h"
|
||||
|
||||
//By default, test multibinary version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST ec_encode_data_update
|
||||
# define REF_FUNCTION ec_encode_data
|
||||
#endif
|
||||
|
||||
//By default, test EC(8+4)
|
||||
#if (!defined(VECT))
|
||||
# define VECT 4
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 32
|
||||
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (10000*m)
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 32
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (50*m)
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS(m) 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print 'len' bytes of 'buf' to stdout as two-digit hex values,
// wrapping to a new line every 32 bytes.
void dump(unsigned char *buf, int len)
{
	int idx = 0;
	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Performance test for the macro-selected encode-update routine
// (FUNCTION_UNDER_TEST, default ec_encode_data_update) against the one-shot
// reference encoder (REF_FUNCTION, default ec_encode_data).  Verifies that
// feeding sources one at a time through the update API produces the same
// parity as the one-shot API, then times encode and decode loops via the
// perf_start/perf_stop helpers from test.h.
// Returns 0 on pass, -1 on any allocation, parameter, or mismatch failure.
int main(int argc, char *argv[])
{
	int i, j, rtest, m, k, nerrs, r;
	void *buf;
	u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	u8 *update_buffs[TEST_SOURCES];
	u8 *perf_update_buffs[TEST_SOURCES];
	// a = encode matrix, b = surviving-rows matrix, d = inverse of b,
	// c = decode rows extracted from d for the erased indices.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
	u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
	struct perf start, stop;

	// Pick test parameters: k data buffers plus VECT parity buffers,
	// and erase exactly VECT of them (fixed list below, not random).
	k = 10;
	m = k + VECT;
	nerrs = VECT;
	const u8 err_list[] = { 0, 2, 4, 5, 7, 8 };

	printf(xstr(FUNCTION_UNDER_TEST) "_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);

	if (m > MMAX || k > KMAX || nerrs > (m - k)) {
		printf(" Input test parameter error\n");
		return -1;
	}

	// Mark the first nerrs entries of err_list as erased sources.
	memcpy(src_err_list, err_list, nerrs);
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0; i < nerrs; i++)
		src_in_err[src_err_list[i]] = 1;

	// Allocate the arrays (64-byte aligned; buffers are never freed —
	// acceptable for a test binary that exits immediately after).
	for (i = 0; i < m; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		buffs[i] = buf;
	}

	// Only m - k temp buffers are needed: they receive recovered data
	// for at most nerrs (= VECT = m - k) erased buffers.
	for (i = 0; i < (m - k); i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		temp_buffs[i] = buf;
		memset(temp_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}

	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail");
			return -1;
		}
		update_buffs[i] = buf;
		memset(update_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail");
			return -1;
		}
		perf_update_buffs[i] = buf;
		memset(perf_update_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}

	// Make random data (same content in buffs and update_buffs so the
	// two encode paths can be compared byte-for-byte).
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN(m); j++) {
			buffs[i][j] = rand();
			update_buffs[i][j] = buffs[i][j];
		}

	// Encode once with the reference one-shot API...
	gf_gen_rs_matrix(a, m, k);
	ec_init_tables(k, m - k, &a[k * k], g_tbls);
	REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);

	// ...and once incrementally, one source buffer per call.
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, update_buffs[i],
				    &update_buffs[k]);
	}
	// Parity from the update path must match the one-shot path exactly.
	for (i = 0; i < m - k; i++) {
		if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN(m))) {
			printf("\nupdate_buffs%d :", i);
			dump(update_buffs[k + i], 25);
			printf("buffs%d :", i);
			dump(buffs[k + i], 25);
			return -1;
		}
	}

#ifdef DO_REF_PERF
	// Optional baseline: time the one-shot reference encoder.
	REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	// Start encode test
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	}
	perf_stop(&stop);
	printf(xstr(REF_FUNCTION) TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
#endif
	// Warm-up pass before timing the update-style encode.
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, perf_update_buffs[i],
				    &perf_update_buffs[k]);
	}
	// Start encode test: full update encode, all k sources per iteration.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		for (i = 0; i < k; i++) {
			FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls,
					    perf_update_buffs[i], &perf_update_buffs[k]);
		}
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);

	// Start encode test: single-source update, including table re-init.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, 0, g_tbls, perf_update_buffs[0],
				    &perf_update_buffs[k]);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) "_single_src" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m - k + 1) * rtest);

	// Start encode test: single-source update only (no table re-init),
	// isolating the cost of the update call itself.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, 0, g_tbls, perf_update_buffs[0],
				    &perf_update_buffs[k]);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) "_single_src_simple" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m - k + 1) * rtest);

	// Re-zero parity destinations and re-encode so decode operates on
	// freshly produced parity.
	for (i = k; i < m; i++) {
		memset(update_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, update_buffs[i],
				    &update_buffs[k]);
	}
	// Construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		recov[i] = update_buffs[r];
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	if (gf_invert_matrix(b, d, k) < 0) {
		printf("BAD MATRIX\n");
		return -1;
	}

	// Pull the rows of the inverse that reconstruct the erased buffers.
	for (i = 0; i < nerrs; i++)
		for (j = 0; j < k; j++)
			c[k * i + j] = d[k * src_err_list[i] + j];

	// Recover data once into temp_buffs (correctness-checked below).
	ec_init_tables(k, nerrs, c, g_tbls);
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, nerrs, i, g_tbls, recov[i], temp_buffs);
	}
	// Start decode test: timed loop writes into perf_update_buffs so the
	// checked results in temp_buffs are left untouched.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r])
				r++;
			recov[i] = update_buffs[r];
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix(b, d, k) < 0) {
			printf("BAD MATRIX\n");
			return -1;
		}

		for (i = 0; i < nerrs; i++)
			for (j = 0; j < k; j++)
				c[k * i + j] = d[k * src_err_list[i] + j];

		// Recover data
		ec_init_tables(k, nerrs, c, g_tbls);
		for (i = 0; i < k; i++) {
			FUNCTION_UNDER_TEST(TEST_LEN(m), k, nerrs, i, g_tbls, recov[i],
					    perf_update_buffs);
		}
	}
	perf_stop(&stop);

	// Recovered buffers must match the original (pre-erasure) sources.
	for (i = 0; i < nerrs; i++) {
		if (0 != memcmp(temp_buffs[i], update_buffs[src_err_list[i]], TEST_LEN(m))) {
			printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
			return -1;
		}
	}

	printf(xstr(FUNCTION_UNDER_TEST) "_decode" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);

	printf("done all: Pass\n");
	return 0;
}
|
957
erasure_code/erasure_code_update_test.c
Normal file
957
erasure_code/erasure_code_update_test.c
Normal file
@ -0,0 +1,957 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef ALIGN_SIZE
|
||||
# define ALIGN_SIZE 16
|
||||
#endif
|
||||
|
||||
//By default, test multibinary version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST ec_encode_data_update
|
||||
# define REF_FUNCTION ec_encode_data
|
||||
#endif
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B ALIGN_SIZE
|
||||
# define LEN_ALIGN_CHK_B ALIGN_SIZE // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print a buffer as space-separated hex bytes, wrapping every 32 bytes
// and finishing with a trailing newline.
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", buf[pos] & 0xff);
		pos++;
		if ((pos % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored as an array of k row pointers,
// one row of hex bytes per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored contiguously in row-major order,
// one row of hex bytes per output line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[col + (row * m)] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
|
||||
// Generate decode matrix from encode matrix
|
||||
// Generate decode matrix from encode matrix.
// Builds the k x k matrix of surviving rows, inverts it, and derives the
// decode rows for each erased buffer: direct inverse rows for erased data
// buffers, and inverse-times-encode-row dot products for erased parity.
// decode_index[] receives the indices of the k surviving rows used.
// Returns 0 on success, -1 on allocation failure, NO_INVERT_MATRIX if no
// invertible set of surviving rows can be found.
// NOTE: gf_invert_matrix/gf_mul are presumably GF(2^8) routines from
// erasure_code.h (included above) — semantics not visible here.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			// backup keeps a pristine copy so b can be restored
			// each time a substitute last row is tried below.
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	incr = 0;
	// If b is singular, repeatedly replace its last row with the next
	// unused encode-matrix row (offset by incr) until inversion succeeds
	// or the rows run out.
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		if (nerrs == (m - k)) {
			// All parity rows are erased — no substitutes exist.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// Bump incr past any erased parity row at the candidate
		// position.  NOTE(review): this loop only inspects entries
		// nsrcerrs..nerrs-nsrcerrs-1 and does not restart after
		// incrementing — looks like it may under-scan when
		// nsrcerrs > 0; confirm against upstream isa-l history.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			// Ran past the last encode row without success.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased data buffers decode directly from rows of the inverse.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int re = 0;
|
||||
int i, j, p, rtest, m, k;
|
||||
int nerrs, nsrcerrs;
|
||||
void *buf;
|
||||
unsigned int decode_index[MMAX];
|
||||
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
|
||||
unsigned char *update_buffs[TEST_SOURCES];
|
||||
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
|
||||
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
|
||||
unsigned char *recov[TEST_SOURCES];
|
||||
|
||||
int rows, align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned char *efence_update_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *update_ubuffs[TEST_SOURCES];
|
||||
u8 *temp_ubuffs[TEST_SOURCES];
|
||||
|
||||
printf("test " xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
srand(TEST_SEED);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buffs[i] = buf;
|
||||
memset(temp_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
update_buffs[i] = buf;
|
||||
memset(update_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
|
||||
}
|
||||
// Test erasure code by encode and recovery
|
||||
|
||||
encode_matrix = malloc(MMAX * KMAX);
|
||||
decode_matrix = malloc(MMAX * KMAX);
|
||||
invert_matrix = malloc(MMAX * KMAX);
|
||||
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
|
||||
if (encode_matrix == NULL || decode_matrix == NULL
|
||||
|| invert_matrix == NULL || g_tbls == NULL) {
|
||||
printf("Test failure! Error with malloc\n");
|
||||
return -1;
|
||||
}
|
||||
// Pick a first test
|
||||
m = 15;
|
||||
k = 10;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// Generate encode matrix encode_matrix
|
||||
// The matrix generated by gf_gen_rs_matrix
|
||||
// is not always invertable.
|
||||
gf_gen_rs_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
|
||||
printf("\nupdate_buffs%d :", i);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d :", i);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
REF_FUNCTION(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Pick a first test
|
||||
m = 7;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
memset(update_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
|
||||
printf("\nupdate_buffs%d :", i);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d :", i);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, nerrs, i, g_tbls, recov[i], &temp_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Do more random tests
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
memset(update_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
|
||||
printf("\nupdate_buffs%d :", i);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d :", i);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_buffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]],
|
||||
TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(update_buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
k = 16;
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
|
||||
if (k > KMAX)
|
||||
return -1;
|
||||
|
||||
for (rows = 1; rows <= 16; rows++) {
|
||||
m = k + rows;
|
||||
if (m > MMAX)
|
||||
return -1;
|
||||
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
memset(update_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
for (size = 0; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
efence_update_buffs[i] = update_buffs[i] + TEST_LEN - size;
|
||||
}
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < m; i++) {
|
||||
memset(efence_buffs[i], 0, size);
|
||||
memset(efence_update_buffs[i], 0, size);
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(size, k, m - k, g_tbls, efence_buffs, &efence_buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls,
|
||||
efence_update_buffs[i],
|
||||
&efence_update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 !=
|
||||
memcmp(efence_update_buffs[k + i], efence_buffs[k + i],
|
||||
size)) {
|
||||
printf("\nefence_update_buffs%d :", i);
|
||||
dump(efence_update_buffs[k + i], 25);
|
||||
printf("efence_buffs%d :", i);
|
||||
dump(efence_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = efence_update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_buffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i],
|
||||
efence_update_buffs[src_err_list[i]], size)) {
|
||||
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
|
||||
k, nerrs);
|
||||
|
||||
printf("size = %d\n", size);
|
||||
|
||||
printf("Test erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], align);
|
||||
printf("orig :");
|
||||
dump(efence_update_buffs[src_err_list[i]], align);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < m; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(update_buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
update_ubuffs[i] =
|
||||
update_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
}
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < m; i++) {
|
||||
memset(ubuffs[i], 0, size);
|
||||
memset(update_ubuffs[i], 0, size);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
ubuffs[i][j] = rand();
|
||||
update_ubuffs[i][j] = ubuffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls, update_ubuffs[i],
|
||||
&update_ubuffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_ubuffs[k + i], ubuffs[k + i], size)) {
|
||||
printf("\nupdate_ubuffs%d :", i);
|
||||
dump(update_ubuffs[k + i], 25);
|
||||
printf("ubuffs%d :", i);
|
||||
dump(ubuffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_ubuffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < m; i++) {
|
||||
memset(temp_ubuffs[i], 0, size);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_ubuffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_ubuffs[k + i], update_ubuffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(update_ubuffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(update_ubuffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_ubuffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
offset = update_ubuffs[i] - update_buffs[i];
|
||||
|
||||
if (memcmp(update_buffs[i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign encode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(update_buffs[i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign encode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
|
||||
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign decode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(temp_buffs[k + i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign decode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test size alignment
|
||||
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 13 : ALIGN_SIZE;
|
||||
|
||||
for (size = TEST_LEN; size >= 0; size -= align) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, size);
|
||||
memset(update_buffs[i], 0, size);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(size, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], size)) {
|
||||
printf("\nupdate_buffs%d (size=%d) :", i, size);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d (size=%d) :", i, size);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_buffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(update_buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf("done EC tests: Pass\n");
|
||||
return 0;
|
||||
}
|
337
erasure_code/gf_2vect_dot_prod_avx.asm
Normal file
337
erasure_code/gf_2vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,337 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_reg r12, 3*16 + 0*8
|
||||
save_reg r13, 3*16 + 1*8
|
||||
save_reg r14, 3*16 + 2*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
mov r12, [rsp + 3*16 + 0*8]
|
||||
mov r13, [rsp + 3*16 + 1*8]
|
||||
mov r14, [rsp + 3*16 + 2*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(0)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*1 ;1 local variable
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*1 ;1 local variable
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define dest2 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f xmm8
|
||||
%define xgft1_lo xmm7
|
||||
%define xgft1_hi xmm6
|
||||
%define xgft2_lo xmm5
|
||||
%define xgft2_hi xmm4
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%else ;32-bit code
|
||||
%define xmask0f xmm4
|
||||
%define xgft1_lo xmm7
|
||||
%define xgft1_hi xmm6
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%endif
|
||||
|
||||
align 16
|
||||
global gf_2vect_dot_prod_avx:function
|
||||
|
||||
func(gf_2vect_dot_prod_avx)
|
||||
FUNC_SAVE
|
||||
SLDR len, len_m
|
||||
sub len, 16
|
||||
SSTR len_m, len
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
SLDR dest1, dest1_m
|
||||
mov dest2, [dest1+PS]
|
||||
SSTR dest2_m, dest2
|
||||
mov dest1, [dest1]
|
||||
SSTR dest1_m, dest1
|
||||
|
||||
.loop16
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
|
||||
.next_vect
|
||||
SLDR src, src_m
|
||||
mov ptr, [src+vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%endif
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
%ifidn PS,4 ; 32-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%endif
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
SLDR dest1, dest1_m
|
||||
SLDR dest2, dest2_m
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
|
||||
SLDR len, len_m
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop16 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_dot_prod_avx, 02, 05, 0191
|
356
erasure_code/gf_2vect_dot_prod_avx2.asm
Normal file
356
erasure_code/gf_2vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,356 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
vmovdqa [rsp + 0*16], xmm6
|
||||
vmovdqa [rsp + 1*16], xmm7
|
||||
vmovdqa [rsp + 2*16], xmm8
|
||||
save_reg r12, 3*16 + 0*8
|
||||
save_reg r13, 3*16 + 1*8
|
||||
save_reg r14, 3*16 + 2*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
mov r12, [rsp + 3*16 + 0*8]
|
||||
mov r13, [rsp + 3*16 + 1*8]
|
||||
mov r14, [rsp + 3*16 + 2*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp.w edx
|
||||
%define tmp.b dl
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(0)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*1 ;1 local variable
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*1 ;1 local variable
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define dest2 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f ymm8
|
||||
%define xmask0fx xmm8
|
||||
%define xgft1_lo ymm7
|
||||
%define xgft1_hi ymm6
|
||||
%define xgft2_lo ymm5
|
||||
%define xgft2_hi ymm4
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%else ;32-bit code
|
||||
%define xmask0f ymm7
|
||||
%define xmask0fx xmm7
|
||||
%define xgft1_lo ymm5
|
||||
%define xgft1_hi ymm4
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
|
||||
%endif
|
||||
|
||||
align 16
|
||||
global gf_2vect_dot_prod_avx2:function
|
||||
|
||||
func(gf_2vect_dot_prod_avx2)
|
||||
FUNC_SAVE
|
||||
SLDR len, len_m
|
||||
sub len, 32
|
||||
SSTR len_m, len
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
SLDR dest1, dest1_m
|
||||
mov dest2, [dest1+PS]
|
||||
SSTR dest2_m, dest2
|
||||
mov dest1, [dest1]
|
||||
SSTR dest1_m, dest1
|
||||
|
||||
.loop32
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
|
||||
.next_vect
|
||||
SLDR src, src_m
|
||||
mov ptr, [src+vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%else
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
%endif
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
%ifidn PS,4 ; 32-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%endif
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
SLDR dest1, dest1_m
|
||||
SLDR dest2, dest2_m
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
|
||||
SLDR len, len_m
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop32 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_dot_prod_avx2, 04, 05, 0196
|
339
erasure_code/gf_2vect_dot_prod_sse.asm
Normal file
339
erasure_code/gf_2vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,339 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
;;;
;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
;;; Computes two GF(2^8) dot products over the same set of source buffers
;;; using SSE (pshufb nibble-table lookups), writing one output buffer per
;;; product.  Tables in g_tbls hold 32 bytes per source per output: a
;;; low-nibble table followed by a high-nibble table.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: args arrive in rdi, rsi, rdx, rcx, r8, r9.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r9
 %define tmp4  r12		; must be saved and restored
 %define return rax
 ;; SLDR/SSTR are no-ops in 64-bit builds: all working values fit in
 ;; registers, so the stack load/store used by elf32 is not needed.
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: args arrive in rcx, rdx, r8, r9; 5th arg on stack.
 ;; xmm6-xmm8 and r12-r14 are callee-saved here.
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12 		; must be saved, loaded and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size  3*16 + 3*8 	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_reg	r12,  3*16 + 0*8
	save_reg	r13,  3*16 + 1*8
	save_reg	r14,  3*16 + 2*8
	end_prolog
	mov	arg4, arg(4)		; load 5th argument from the stack
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm8, [rsp + 2*16]
	mov	r12,  [rsp + 3*16 + 0*8]
	mov	r13,  [rsp + 3*16 + 1*8]
	mov	r14,  [rsp + 3*16 + 2*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;; 32-bit cdecl stack layout used by arg()/var() below:
;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 ;; Too few registers in 32-bit mode: several logical names alias the
 ;; same physical register (trans/trans2), with the live value spilled to
 ;; the stack slots below and reloaded via SLDR/SSTR as needed.
 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m	 arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m	 arg(2)
 %define arg3	 trans
 %define arg3_m	 arg(3)
 %define arg4	 trans
 %define arg4_m	 arg(4)
 %define tmp	 edx
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp4	 trans2
 %define tmp4_m	 var(0)
 %define return	 eax
 %macro SLDR 2			;; stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*1	;1 local variable
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*1	;1 local variable
	pop	ebp
 %endmacro

%endif	; output formats

;; Friendly names for the function arguments and loop state.
%define len   arg0
%define vec   arg1
%define mul_array arg2
%define src   arg3
%define dest1 arg4

%define vec_i tmp2
%define ptr   tmp3
%define dest2 tmp4
%define pos   return

%ifidn PS,4			;32-bit code
 ;; Stack homes for values that share a register in 32-bit mode.
 %define len_m	 arg0_m
 %define src_m	 arg3_m
 %define dest1_m arg4_m
 %define dest2_m tmp4_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

%ifidn PS,8			;64-bit code
 default rel
 [bits 64]
%endif

section .text

%ifidn PS,8			;64-bit code
 %define xmask0f  xmm8
 %define xgft1_lo xmm7
 %define xgft1_hi xmm6
 %define xgft2_lo xmm5
 %define xgft2_hi xmm4

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
%else				;32-bit code
 ;; Only 8 xmm registers available: the second table pair reuses the
 ;; first pair's registers, so tables are reloaded inside the loop.
 %define xmask0f  xmm4
 %define xgft1_lo xmm7
 %define xgft1_hi xmm6
 %define xgft2_lo xgft1_lo
 %define xgft2_hi xgft1_hi

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
%endif

align 16
global gf_2vect_dot_prod_sse:function

func(gf_2vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			;Main loop consumes 16 bytes/iteration;
	SSTR	len_m, len		; len < 16 is an error
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of pointers
	SSTR	dest2_m, dest2
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16
	pxor	xp1, xp1		;Clear both accumulators
	pxor	xp2, xp2
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;Next source buffer

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8				;64-bit code
	;; Second-output tables live vec*32 bytes past the first-output
	;; tables; enough registers here to load both pairs up front.
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				;32-bit code
	;; 32-bit: xgft2_* alias xgft1_*, so the second table pair must be
	;; loaded only now, after the first product has been accumulated.
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}

	add	tmp, 32
	add	vec_i, PS
%endif
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	;; pos now ran past len-16; if it landed exactly at len the whole
	;; buffer was covered, otherwise redo a final overlapped block.
	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_2vect_dot_prod_sse, 00, 04, 0062
|
216
erasure_code/gf_2vect_dot_prod_sse_perf.c
Normal file
216
erasure_code/gf_2vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,216 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Print len bytes of buf in hex, 32 bytes per output row. */
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k-row by m-column matrix of bytes in hex, one row per line. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Performance harness for FUNCTION_UNDER_TEST (default
 * gf_2vect_dot_prod_sse): times repeated table init + 2-way dot product
 * over TEST_SOURCES buffers of TEST_LEN bytes, then verifies the last
 * result against the scalar gf_vect_dot_prod_base reference.
 */
int main(int argc, char *argv[])
{
	int i, j;
	void *buf;
	/* g1/g2: random GF coefficients per source; g_tbls holds the two
	 * 32-byte-per-source lookup-table sets back to back. */
	u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32];
	u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2];
	u8 *buffs[TEST_SOURCES];
	struct perf start, stop;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned for the SIMD kernels)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest2 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref2 = buf;

	dest_ptrs[0] = dest1;
	dest_ptrs[1] = dest2;

	// Performance test: fill sources with random data
	for (i = 0; i < TEST_SOURCES; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	memset(dest1, 0, TEST_LEN);
	memset(dest2, 0, TEST_LEN);
	memset(dest_ref1, 0, TEST_LEN);
	memset(dest_ref2, 0, TEST_LEN);

	for (i = 0; i < TEST_SOURCES; i++) {
		g1[i] = rand();
		g2[i] = rand();
	}

	// Expand coefficients into the two lookup-table sets
	for (j = 0; j < TEST_SOURCES; j++) {
		gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
		gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
	}

	// Scalar reference results for the later correctness check
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
			      dest_ref2);

#ifdef DO_REF_PERF
	// Optional: time the scalar baseline (fewer loops, it is slow)
	perf_start(&start);
	for (i = 0; i < TEST_LOOPS / 100; i++) {
		for (j = 0; j < TEST_SOURCES; j++) {
			gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
			gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
		}

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
				      buffs, dest_ref2);
	}
	perf_stop(&stop);
	printf("gf_2vect_dot_prod_base" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 2) * i);
#endif

	// Warm-up call before timing
	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

	// Timed loop: table init + dot product each iteration
	perf_start(&start);
	for (i = 0; i < TEST_LOOPS; i++) {
		for (j = 0; j < TEST_SOURCES; j++) {
			gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
			gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
		}

		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
	/* TEST_SOURCES+2 buffers of TEST_LEN bytes touched per iteration */
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 2) * i);

	// Verify SIMD output matches the scalar reference
	if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
		printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref1, 25);
		printf("dprod_dut:");
		dump(dest1, 25);
		return -1;
	}
	if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
		printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref2, 25);
		printf("dprod_dut:");
		dump(dest2, 25);
		return -1;
	}

	printf("pass perf check\n");
	return 0;

}
|
477
erasure_code/gf_2vect_dot_prod_sse_test.c
Normal file
477
erasure_code/gf_2vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,477 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 10000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf, wrapping to a new line every 32 bytes. */
void dump(unsigned char *buf, int len)
{
	int n;

	for (n = 0; n < len; n++) {
		printf(" %2x", 0xff & buf[n]);
		if ((n + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Hex-dump the first m bytes of each of the k row pointers in s. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", s[r][c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Hex-dump a flat k-by-m byte array (row-major), one row per line. */
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", s[r * m + c] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Correctness harness for FUNCTION_UNDER_TEST (default
 * gf_2vect_dot_prod_sse).  Runs the SIMD 2-way dot product against the
 * scalar gf_vect_dot_prod_base reference across several phases:
 * all-zero data, random data, varied source counts, end-of-buffer
 * (Electric Fence style) placement, random pointer alignment with
 * write-over checks, and all length alignments.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs;
	void *buf;
	/* g1/g2: GF coefficients per source; g_tbls holds both 32-byte
	 * per-source lookup-table sets back to back. */
	u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32];
	u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2];
	u8 *buffs[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];	// unaligned views into buffs
	u8 *udest_ptrs[2];		// unaligned views into dest1/dest2

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest2 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref2 = buf;

	dest_ptrs[0] = dest1;
	dest_ptrs[1] = dest2;

	// Test of all zeros
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest1, 0, TEST_LEN);
	memset(dest2, 0, TEST_LEN);
	memset(dest_ref1, 0, TEST_LEN);
	memset(dest_ref2, 0, TEST_LEN);
	memset(g1, 2, TEST_SOURCES);	// fixed non-trivial coefficients
	memset(g2, 1, TEST_SOURCES);

	for (i = 0; i < TEST_SOURCES; i++) {
		gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
		gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
	}

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
			      dest_ref2);

	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

	if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref1, 25);
		printf("dprod_dut:");
		dump(dest1, 25);
		return -1;
	}
	if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref2, 25);
		printf("dprod_dut:");
		dump(dest2, 25);
		return -1;
	}

	putchar('.');	// progress marker per passed phase/iteration

	// Rand data test

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < TEST_SOURCES; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
		}

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
				      buffs, dest_ref2);

		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

		if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_dut:");
			dump(dest1, 25);
			return -1;
		}
		if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_dut:");
			dump(dest2, 25);
			return -1;
		}

		putchar('.');
	}

	// Rand data test with varied parameters (every source count)
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < srcs; i++) {
				g1[i] = rand();
				g2[i] = rand();
			}

			for (i = 0; i < srcs; i++) {
				gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
				gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
			}

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
					      dest_ref2);

			FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);

			if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test1 srcs=%d\n", srcs);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref1, 25);
				printf("dprod_dut:");
				dump(dest1, 25);
				return -1;
			}
			if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test2 srcs=%d\n", srcs);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref2, 25);
				printf("dprod_dut:");
				dump(dest2, 25);
				return -1;
			}

			putchar('.');
		}
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < TEST_SOURCES; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < TEST_SOURCES; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
		}

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
				      efence_buffs, dest_ref2);

		FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);

		if (0 != memcmp(dest_ref1, dest1, size)) {
			/* NOTE(review): message prints rtest, which is stale
			 * here (this loop is indexed by size) — presumably
			 * should report size; left as-is. */
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, align);
			printf("dprod_dut:");
			dump(dest1, align);
			return -1;
		}

		if (0 != memcmp(dest_ref2, dest2, size)) {
			/* NOTE(review): rtest is stale here as well. */
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, align);
			printf("dprod_dut:");
			dump(dest2, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
		udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest1, 0, TEST_LEN);	// zero pad to check write-over
		memset(dest2, 0, TEST_LEN);

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < srcs; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < srcs; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
		}

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
		gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);

		if (memcmp(dest_ref1, udest_ptrs[0], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_dut:");
			dump(udest_ptrs[0], 25);
			return -1;
		}
		if (memcmp(dest_ref2, udest_ptrs[1], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_dut:");
			dump(udest_ptrs[1], 25);
			return -1;
		}
		// Confirm that padding around dests is unchanged
		memset(dest_ref1, 0, PTR_ALIGN_CHK_B);	// Make reference zero buff
		offset = udest_ptrs[0] - dest1;

		if (memcmp(dest1, dest_ref1, offset)) {
			printf("Fail rand ualign pad1 start\n");
			return -1;
		}
		if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad1 end\n");
			return -1;
		}

		offset = udest_ptrs[1] - dest2;
		if (memcmp(dest2, dest_ref1, offset)) {
			printf("Fail rand ualign pad2 start\n");
			return -1;
		}
		if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad2 end\n");
			return -1;
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		srcs = TEST_SOURCES;

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < srcs; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < srcs; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
		}

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);

		if (memcmp(dest_ref1, dest_ptrs[0], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_dut:");
			dump(dest_ptrs[0], 25);
			return -1;
		}
		if (memcmp(dest_ref2, dest_ptrs[1], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_dut:");
			dump(dest_ptrs[1], 25);
			return -1;
		}
	}

	printf("Pass\n");
	return 0;

}
|
236
erasure_code/gf_2vect_mad_avx.asm
Normal file
236
erasure_code/gf_2vect_mad_avx.asm
Normal file
@ -0,0 +1,236 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*9 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
save_reg r12, 9*16 + 0*8
|
||||
save_reg r15, 9*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
mov r12, [rsp + 9*16 + 0*8]
|
||||
mov r15, [rsp + 9*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm14
|
||||
%define xgft1_lo xmm13
|
||||
%define xgft1_hi xmm12
|
||||
%define xgft2_lo xmm11
|
||||
%define xgft2_hi xmm10
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xd1 xmm6
|
||||
%define xd2 xmm7
|
||||
%define xtmpd1 xmm8
|
||||
%define xtmpd2 xmm9
|
||||
|
||||
|
||||
align 16
|
||||
global gf_2vect_mad_avx:function
|
||||
|
||||
func(gf_2vect_mad_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
|
||||
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
|
||||
|
||||
.loop16
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
.loop16_overlap:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
vmovdqa xd1, xtmpd1 ;Restore xd1
|
||||
vmovdqa xd2, xtmpd2 ;Restore xd2
|
||||
jmp .loop16_overlap ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_mad_avx, 02, 01, 0204
|
247
erasure_code/gf_2vect_mad_avx2.asm
Normal file
247
erasure_code/gf_2vect_mad_avx2.asm
Normal file
@ -0,0 +1,247 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*9 + 3*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
vmovdqa [rsp+16*0],xmm6
|
||||
vmovdqa [rsp+16*1],xmm7
|
||||
vmovdqa [rsp+16*2],xmm8
|
||||
vmovdqa [rsp+16*3],xmm9
|
||||
vmovdqa [rsp+16*4],xmm10
|
||||
vmovdqa [rsp+16*5],xmm11
|
||||
vmovdqa [rsp+16*6],xmm12
|
||||
vmovdqa [rsp+16*7],xmm13
|
||||
vmovdqa [rsp+16*8],xmm14
|
||||
save_reg r12, 9*16 + 0*8
|
||||
save_reg r15, 9*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp+16*0]
|
||||
vmovdqa xmm7, [rsp+16*1]
|
||||
vmovdqa xmm8, [rsp+16*2]
|
||||
vmovdqa xmm9, [rsp+16*3]
|
||||
vmovdqa xmm10, [rsp+16*4]
|
||||
vmovdqa xmm11, [rsp+16*5]
|
||||
vmovdqa xmm12, [rsp+16*6]
|
||||
vmovdqa xmm13, [rsp+16*7]
|
||||
vmovdqa xmm14, [rsp+16*8]
|
||||
mov r12, [rsp + 9*16 + 0*8]
|
||||
mov r15, [rsp + 9*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm14
|
||||
%define xmask0fx xmm14
|
||||
%define xgft1_lo ymm13
|
||||
%define xgft1_hi ymm12
|
||||
%define xgft2_lo ymm11
|
||||
%define xgft2_hi ymm10
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmph1 ymm2
|
||||
%define xtmpl1 ymm3
|
||||
%define xtmph2 ymm4
|
||||
%define xtmpl2 ymm5
|
||||
%define xd1 ymm6
|
||||
%define xd2 ymm7
|
||||
%define xtmpd1 ymm8
|
||||
%define xtmpd2 ymm9
|
||||
|
||||
align 16
|
||||
global gf_2vect_mad_avx2:function
|
||||
|
||||
func(gf_2vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
mov dest2, [dest1+PS] ; reuse mul_array
|
||||
mov dest1, [dest1]
|
||||
|
||||
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
|
||||
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
|
||||
|
||||
.loop32
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
.loop32_overlap:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-32
|
||||
vmovdqa xd1, xtmpd1 ;Restore xd1
|
||||
vmovdqa xd2, xtmpd2 ;Restore xd2
|
||||
jmp .loop32_overlap ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_mad_avx2, 04, 01, 0205
|
239
erasure_code/gf_2vect_mad_sse.asm
Normal file
239
erasure_code/gf_2vect_mad_sse.asm
Normal file
@ -0,0 +1,239 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*9 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
save_reg r12, 9*16 + 0*8
|
||||
save_reg r15, 9*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
mov r12, [rsp + 9*16 + 0*8]
|
||||
mov r15, [rsp + 9*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm14
|
||||
%define xgft1_lo xmm13
|
||||
%define xgft1_hi xmm12
|
||||
%define xgft2_lo xmm11
|
||||
%define xgft2_hi xmm10
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xd1 xmm6
|
||||
%define xd2 xmm7
|
||||
%define xtmpd1 xmm8
|
||||
%define xtmpd2 xmm9
|
||||
|
||||
|
||||
align 16
|
||||
global gf_2vect_mad_sse:function
|
||||
func(gf_2vect_mad_sse)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
|
||||
xor pos, pos
|
||||
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
movdqu xgft1_lo,[tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
|
||||
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
|
||||
|
||||
.loop16:
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
.loop16_overlap:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
movdqa xtmph1, xgft1_hi ;Reload const array registers
|
||||
movdqa xtmpl1, xgft1_lo
|
||||
movdqa xtmph2, xgft2_hi ;Reload const array registers
|
||||
movdqa xtmpl2, xgft2_lo
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
pshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
pxor xd1, xtmph1
|
||||
|
||||
pshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
pxor xd2, xtmph2
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result
|
||||
XSTR [dest2+pos], xd2 ;Store result
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
movdqa xd1, xtmpd1 ;Restore xd1
|
||||
movdqa xd2, xtmpd2 ;Restore xd2
|
||||
jmp .loop16_overlap ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
|
||||
mask0f:
|
||||
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_mad_sse, 00, 01, 0203
|
377
erasure_code/gf_3vect_dot_prod_avx.asm
Normal file
377
erasure_code/gf_3vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,377 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_reg r12, 6*16 + 0*8
|
||||
save_reg r13, 6*16 + 1*8
|
||||
save_reg r14, 6*16 + 2*8
|
||||
save_reg r15, 6*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
mov r12, [rsp + 6*16 + 0*8]
|
||||
mov r13, [rsp + 6*16 + 1*8]
|
||||
mov r14, [rsp + 6*16 + 2*8]
|
||||
mov r15, [rsp + 6*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; var1
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define arg5 trans2
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp3_m var(0)
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(1)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*2 ;2 local variables
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*2 ;2 local variables
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
%define ptr arg5
|
||||
|
||||
%define vec_i tmp2
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp3_m
|
||||
%define dest3_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f xmm11
|
||||
%define xgft1_lo xmm10
|
||||
%define xgft1_hi xmm9
|
||||
%define xgft2_lo xmm8
|
||||
%define xgft2_hi xmm7
|
||||
%define xgft3_lo xmm6
|
||||
%define xgft3_hi xmm5
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%else
|
||||
%define xmask0f xmm7
|
||||
%define xgft1_lo xmm6
|
||||
%define xgft1_hi xmm5
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
%define xgft3_lo xgft1_lo
|
||||
%define xgft3_hi xgft1_hi
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%endif
|
||||
|
||||
align 16
global gf_3vect_dot_prod_avx:function
;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests)
;;; GF(2^8) dot product of `vec` source buffers into 3 destination buffers,
;;; processed 16 bytes per iteration with AVX nibble-table lookups.
;;; Returns 0 on success, 1 if len < 16.
func(gf_3vect_dot_prod_avx)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			;Bias len so the loop test covers a full 16B block
	SSTR	len_m, len
	jl	.return_fail		;len < 16: too short to process one block
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of 3 pointers; unpack it
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	vpxor	xp1, xp1		;Zero the three partial-product accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	mov	tmp, mul_array		;tmp walks the A (first dest) table set
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;ptr = next source buffer

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8				; 64-bit code: enough regs to load B and C tables up front
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				; 32-bit code: B/C tables reloaded here (regs are shared)
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
%endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				; 32-bit code
	sal	vec, 1			;Double vec so vec*(32/PS) reaches the C tables
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1			;Restore vec
	add	tmp, 32
	add	vec_i, PS
%endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original (unbiased) length
	cmp	pos, tmp
	je	.return_pass		;Length was an exact multiple of 16

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func            core, ver, snum
slversion gf_3vect_dot_prod_avx, 02, 05, 0192
|
397
erasure_code/gf_3vect_dot_prod_avx2.asm
Normal file
397
erasure_code/gf_3vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,397 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
vmovdqa [rsp + 0*16], xmm6
|
||||
vmovdqa [rsp + 1*16], xmm7
|
||||
vmovdqa [rsp + 2*16], xmm8
|
||||
vmovdqa [rsp + 3*16], xmm9
|
||||
vmovdqa [rsp + 4*16], xmm10
|
||||
vmovdqa [rsp + 5*16], xmm11
|
||||
save_reg r12, 6*16 + 0*8
|
||||
save_reg r13, 6*16 + 1*8
|
||||
save_reg r14, 6*16 + 2*8
|
||||
save_reg r15, 6*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
mov r12, [rsp + 6*16 + 0*8]
|
||||
mov r13, [rsp + 6*16 + 1*8]
|
||||
mov r14, [rsp + 6*16 + 2*8]
|
||||
mov r15, [rsp + 6*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; var1
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define arg5 trans2
|
||||
%define tmp edx
|
||||
%define tmp.w edx
|
||||
%define tmp.b dl
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp3_m var(0)
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(1)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*2 ;2 local variables
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*2 ;2 local variables
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
%define ptr arg5
|
||||
|
||||
%define vec_i tmp2
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp3_m
|
||||
%define dest3_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f ymm11
|
||||
%define xmask0fx xmm11
|
||||
%define xgft1_lo ymm10
|
||||
%define xgft1_hi ymm9
|
||||
%define xgft2_lo ymm8
|
||||
%define xgft2_hi ymm7
|
||||
%define xgft3_lo ymm6
|
||||
%define xgft3_hi ymm5
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%define xp3 ymm4
|
||||
%else
|
||||
%define xmask0f ymm7
|
||||
%define xmask0fx xmm7
|
||||
%define xgft1_lo ymm6
|
||||
%define xgft1_hi ymm5
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
%define xgft3_lo xgft1_lo
|
||||
%define xgft3_hi xgft1_hi
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%define xp3 ymm4
|
||||
|
||||
%endif
|
||||
|
||||
align 16
global gf_3vect_dot_prod_avx2:function
;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
;;; GF(2^8) dot product of `vec` source buffers into 3 destination buffers,
;;; processed 32 bytes per iteration with AVX2. The 16-byte nibble tables
;;; are broadcast to both ymm lanes via vperm2i128.
;;; Returns 0 on success, 1 if len < 32.
func(gf_3vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32			;Bias len so the loop test covers a full 32B block
	SSTR	len_m, len
	jl	.return_fail		;len < 32: too short to process one block
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of 3 pointers; unpack it
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop32:
	vpxor	xp1, xp1		;Zero the three partial-product accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;ptr = next source buffer

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
					; "     Ax{00}, Ax{10}, ..., Ax{f0}
	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
%ifidn PS,8				; 64-bit code: enough regs to load B and C tables up front
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
					; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo

	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
					; "     Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo

	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				; 32-bit code: B/C tables reloaded here (regs are shared)
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
					; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
%endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				; 32-bit code
	sal	vec, 1			;Double vec so vec*(32/PS) reaches the C tables
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
					; "     Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
	sar	vec, 1			;Restore vec
	add	tmp, 32
	add	vec_i, PS
%endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3

	SLDR	len, len_m
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		;tmp = original (unbiased) length
	cmp	pos, tmp
	je	.return_pass		;Length was an exact multiple of 32

	;; Tail len
	mov	pos, len		;Overlapped offset length-32 (this is a 32B/pass kernel)
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func            core, ver, snum
slversion gf_3vect_dot_prod_avx2, 04, 05, 0197
|
378
erasure_code/gf_3vect_dot_prod_sse.asm
Normal file
378
erasure_code/gf_3vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,378 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_reg r12, 6*16 + 0*8
|
||||
save_reg r13, 6*16 + 1*8
|
||||
save_reg r14, 6*16 + 2*8
|
||||
save_reg r15, 6*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp + 0*16]
|
||||
movdqa xmm7, [rsp + 1*16]
|
||||
movdqa xmm8, [rsp + 2*16]
|
||||
movdqa xmm9, [rsp + 3*16]
|
||||
movdqa xmm10, [rsp + 4*16]
|
||||
movdqa xmm11, [rsp + 5*16]
|
||||
mov r12, [rsp + 6*16 + 0*8]
|
||||
mov r13, [rsp + 6*16 + 1*8]
|
||||
mov r14, [rsp + 6*16 + 2*8]
|
||||
mov r15, [rsp + 6*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; var1
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define arg5 trans2
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp3_m var(0)
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(1)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*2 ;2 local variables
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*2 ;2 local variables
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
%define ptr arg5
|
||||
|
||||
%define vec_i tmp2
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp3_m
|
||||
%define dest3_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f xmm11
|
||||
%define xgft1_lo xmm2
|
||||
%define xgft1_hi xmm3
|
||||
%define xgft2_lo xmm4
|
||||
%define xgft2_hi xmm7
|
||||
%define xgft3_lo xmm6
|
||||
%define xgft3_hi xmm5
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm10
|
||||
%define xp2 xmm9
|
||||
%define xp3 xmm8
|
||||
%else
|
||||
%define xmask0f xmm7
|
||||
%define xgft1_lo xmm6
|
||||
%define xgft1_hi xmm5
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
%define xgft3_lo xgft1_lo
|
||||
%define xgft3_hi xgft1_hi
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%endif
|
||||
|
||||
align 16
global gf_3vect_dot_prod_sse:function
;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests)
;;; GF(2^8) dot product of `vec` source buffers into 3 destination buffers,
;;; processed 16 bytes per iteration with SSE (pshufb) nibble-table lookups.
;;; Returns 0 on success, 1 if len < 16.
func(gf_3vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			;Bias len so the loop test covers a full 16B block
	SSTR	len_m, len
	jl	.return_fail		;len < 16: too short to process one block
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of 3 pointers; unpack it
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	pxor	xp1, xp1		;Zero the three partial-product accumulators
	pxor	xp2, xp2
	pxor	xp3, xp3
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;ptr = next source buffer

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8				;64-bit code: enough regs to load B and C tables up front
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	movdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				;32-bit code: B/C tables reloaded here (regs are shared)
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
%endif
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				;32-bit code
	sal	vec, 1			;Double vec so vec*(32/PS) reaches the C tables
	movdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1			;Restore vec
	add	tmp, 32
	add	vec_i, PS
%endif
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp3, xgft3_hi		;xp3 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original (unbiased) length
	cmp	pos, tmp
	je	.return_pass		;Length was an exact multiple of 16

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func            core, ver, snum
slversion gf_3vect_dot_prod_sse, 00, 06, 0063
|
246
erasure_code/gf_3vect_dot_prod_sse_perf.c
Normal file
246
erasure_code/gf_3vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,246 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Print len bytes of buf as two-digit hex values, 32 bytes per row. */
void dump(unsigned char *buf, int len)
{
	int idx;

	for (idx = 0; idx < len; idx++) {
		printf(" %2x", 0xff & buf[idx]);
		/* Break the row after every 32nd byte (1-based count). */
		if ((idx + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m byte matrix s as two-digit hex values, one row per line. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");	/* end of matrix row */
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 100; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_3vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
583
erasure_code/gf_3vect_dot_prod_sse_test.c
Normal file
583
erasure_code/gf_3vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,583 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 10000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print a buffer as two-digit hex bytes, 32 bytes per output line.
void dump(unsigned char *buf, int len)
{
	int idx = 0;
	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix of bytes (array of k row pointers), one row per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat row-major k x m byte array, one row of m bytes per line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[(row * m) + col] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[3];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) "_test: %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");;
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero" xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");;
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
288
erasure_code/gf_3vect_mad_avx.asm
Normal file
288
erasure_code/gf_3vect_mad_avx.asm
Normal file
@ -0,0 +1,288 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
;;; AVX multiply-accumulate of one source vector into three GF(2^8) dest
;;; vectors, 16 bytes per iteration, using per-nibble pshufb table lookups.

%include "reg_sizes.asm"

%define PS 8			; pointer size in bytes

%ifidn __OUTPUT_FORMAT__, win64
 ;; win64 ABI: args 0-3 in rcx/rdx/r8/r9, args 4-5 fetched from the stack
 ;; into callee-saved r12/r15; xmm6-xmm15 must be preserved.
 %define arg0  rcx
 %define arg0.w ecx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define arg4  r12
 %define arg5  r15
 %define tmp   r11
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

%macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp+16*0],xmm6
	vmovdqa	[rsp+16*1],xmm7
	vmovdqa	[rsp+16*2],xmm8
	vmovdqa	[rsp+16*3],xmm9
	vmovdqa	[rsp+16*4],xmm10
	vmovdqa	[rsp+16*5],xmm11
	vmovdqa	[rsp+16*6],xmm12
	vmovdqa	[rsp+16*7],xmm13
	vmovdqa	[rsp+16*8],xmm14
	vmovdqa	[rsp+16*9],xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r15, 10*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp+16*0]
	vmovdqa	xmm7, [rsp+16*1]
	vmovdqa	xmm8, [rsp+16*2]
	vmovdqa	xmm9, [rsp+16*3]
	vmovdqa	xmm10, [rsp+16*4]
	vmovdqa	xmm11, [rsp+16*5]
	vmovdqa	xmm12, [rsp+16*6]
	vmovdqa	xmm13, [rsp+16*7]
	vmovdqa	xmm14, [rsp+16*8]
	vmovdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r15, [rsp + 10*16 + 1*8]
	add	rsp, stack_size
%endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ;; SysV ABI: all six args in registers; no xmm save/restore needed.
 %define arg0  rdi
 %define arg0.w edi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec   arg1
%define vec_i arg2
%define mul_array arg3
%define src   arg4
%define dest1 arg5
%define pos   return
%define pos.w return.w

;; mul_array and vec_i are consumed early, so their registers are reused
;; to hold the second and third dest pointers.
%define dest2 mul_array
%define dest3 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

%define xmask0f  xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9

%define x0      xmm0
%define xtmpa   xmm1
%define xtmph1  xmm2
%define xtmpl1  xmm3
%define xtmph2  xmm4
%define xtmpl2  xmm5
%define xtmph3  xmm6
%define xtmpl3  xmm7
%define xd1     xmm8
%define xd2     xtmpl1		; xtmpl1/xtmph1 are free after the dest1
%define xd3     xtmph1		; partial is folded in, so reuse them

align 16
global gf_3vect_mad_avx:function
func(gf_3vect_mad_avx)
	FUNC_SAVE
	sub	len, 16			; len now = last full-16B offset
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	sal	vec_i, 5		;Multiply by 32
	sal	vec, 5
	lea	tmp, [mul_array + vec_i]
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	vmovdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	vmovdqu	xgft3_hi, [tmp+2*vec+16]; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	XLDR	xd1, [dest1+pos]	;Get next dest vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xtmph1, xgft1_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl1, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl1	;GF add high and low partials
	vpxor	xd1, xd1, xtmph1	;xd1 += partial

	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest vector

	; dest2
	vpshufb	xtmph2, xgft2_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl2, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
	vpxor	xd2, xd2, xtmph2	;xd2 += partial

	; dest3
	vpshufb	xtmph3, xgft3_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl3, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xtmpl3	;GF add high and low partials
	vpxor	xd3, xd3, xtmph3	;xd3 += partial

	XSTR	[dest1+pos], xd1
	XSTR	[dest2+pos], xd2
	XSTR	[dest3+pos], xd3

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass		; length was an exact multiple of 16

.lessthan16:
	;; Tail len
	;; Do one more overlap pass
	;; Reads/writes the last 16 bytes; a byte mask built from the
	;; remaining length keeps already-finished bytes unchanged.
	mov	tmp, len		;Overlapped offset length-16
	XLDR	x0, [src+tmp]		;Get next source vector
	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest vector

	sub	len, pos

	; NOTE(review): non-VEX movdqa amid AVX code — presumably intentional;
	; confirm no SSE/AVX transition penalty concern here.
	movdqa	xtmph3, [constip16]	;Load const of i + 16
	vpinsrb	xtmpl3, xtmpl3, len.w, 15
	vpshufb	xtmpl3, xtmpl3, xmask0f	;Broadcast len to all bytes
	vpcmpgtb	xtmpl3, xtmpl3, xtmph3

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
	vpand	xgft1_hi, xgft1_hi, xtmpl3	;mask off bytes beyond tail len
	vpxor	xd1, xd1, xgft1_hi

	; dest2
	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
	vpand	xgft2_hi, xgft2_hi, xtmpl3	;mask off bytes beyond tail len
	vpxor	xd2, xd2, xgft2_hi

	; dest3
	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_hi, xgft3_lo	;GF add high and low partials
	vpand	xgft3_hi, xgft3_hi, xtmpl3	;mask off bytes beyond tail len
	vpxor	xd3, xd3, xgft3_hi

	XSTR	[dest1+tmp], xd1
	XSTR	[dest2+tmp], xd2
	XSTR	[dest3+tmp], xd3

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func        core, ver, snum
slversion gf_3vect_mad_avx, 02,  01,  0207
|
317
erasure_code/gf_3vect_mad_avx2.asm
Normal file
317
erasure_code/gf_3vect_mad_avx2.asm
Normal file
@ -0,0 +1,317 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
vmovdqa [rsp+16*0],xmm6
|
||||
vmovdqa [rsp+16*1],xmm7
|
||||
vmovdqa [rsp+16*2],xmm8
|
||||
vmovdqa [rsp+16*3],xmm9
|
||||
vmovdqa [rsp+16*4],xmm10
|
||||
vmovdqa [rsp+16*5],xmm11
|
||||
vmovdqa [rsp+16*6],xmm12
|
||||
vmovdqa [rsp+16*7],xmm13
|
||||
vmovdqa [rsp+16*8],xmm14
|
||||
vmovdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r15, 10*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp+16*0]
|
||||
vmovdqa xmm7, [rsp+16*1]
|
||||
vmovdqa xmm8, [rsp+16*2]
|
||||
vmovdqa xmm9, [rsp+16*3]
|
||||
vmovdqa xmm10, [rsp+16*4]
|
||||
vmovdqa xmm11, [rsp+16*5]
|
||||
vmovdqa xmm12, [rsp+16*6]
|
||||
vmovdqa xmm13, [rsp+16*7]
|
||||
vmovdqa xmm14, [rsp+16*8]
|
||||
vmovdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r15, [rsp + 10*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 mul_array
|
||||
%define dest3 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft1_hi ymm13
|
||||
%define xgft2_lo ymm12
|
||||
%define xgft3_lo ymm11
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmph1 ymm2
|
||||
%define xtmpl1 ymm3
|
||||
%define xtmph2 ymm4
|
||||
%define xtmpl2 ymm5
|
||||
%define xtmpl2x xmm5
|
||||
%define xtmph3 ymm6
|
||||
%define xtmpl3 ymm7
|
||||
%define xtmpl3x xmm7
|
||||
%define xd1 ymm8
|
||||
%define xd2 ymm9
|
||||
%define xd3 ymm10
|
||||
|
||||
align 16
|
||||
global gf_3vect_mad_avx2:function
|
||||
func(gf_3vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
mov dest2, [dest1+PS] ; reuse mul_array
|
||||
mov dest3, [dest1+2*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop32:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;Get next dest vector
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xtmph3 ;xd3 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
XSTR [dest3+pos], xd3
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan32:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp.b, 0x1f
|
||||
vpinsrb xtmpl2x, xtmpl2x, tmp.w, 0
|
||||
vpbroadcastb xtmpl2, xtmpl2x ;Construct mask 0x1f1f1f...
|
||||
|
||||
mov tmp, len ;Overlapped offset length-32
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph3, [constip32] ;Load const of i + 32
|
||||
vpinsrb xtmpl3x, xtmpl3x, len.w, 15
|
||||
vinserti128 xtmpl3, xtmpl3, xtmpl3x, 1 ;swapped to xtmpl3x | xtmpl3x
|
||||
vpshufb xtmpl3, xtmpl3, xtmpl2 ;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
|
||||
vpcmpgtb xtmpl3, xtmpl3, xtmph3
|
||||
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl3
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmpl3
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials
|
||||
vpand xtmph3, xtmph3, xtmpl3
|
||||
vpxor xd3, xd3, xtmph3 ;xd3 += partial
|
||||
|
||||
XSTR [dest1+tmp], xd1
|
||||
XSTR [dest2+tmp], xd2
|
||||
XSTR [dest3+tmp], xd3
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 32
|
||||
constip32:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_3vect_mad_avx2, 04, 01, 0208
|
298
erasure_code/gf_3vect_mad_sse.asm
Normal file
298
erasure_code/gf_3vect_mad_sse.asm
Normal file
@ -0,0 +1,298 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
;;; GF(2^8) multiply-accumulate of one source buffer into three destination
;;; buffers (SSSE3).  Same algorithm as the AVX2 variant but on 16-byte
;;; vectors: per-destination nibble lookup tables applied with PSHUFB, with
;;; one masked overlapped pass for the tail.

%include "reg_sizes.asm"

%define PS 8			; pointer size (bytes)
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12		; non-volatile, spilled in FUNC_SAVE
 %define arg5   r15		; non-volatile, spilled in FUNC_SAVE
 %define tmp    r11
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-xmm15 plus r12/r15
 %define arg(x)  [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Win64 prologue: spill non-volatile registers, load stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0],xmm6
	movdqa	[rsp+16*1],xmm7
	movdqa	[rsp+16*2],xmm8
	movdqa	[rsp+16*3],xmm9
	movdqa	[rsp+16*4],xmm10
	movdqa	[rsp+16*5],xmm11
	movdqa	[rsp+16*6],xmm12
	movdqa	[rsp+16*7],xmm13
	movdqa	[rsp+16*8],xmm14
	movdqa	[rsp+16*9],xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r15, 10*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r15, [rsp + 10*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define return rax
 %define return.w eax

 ; SysV: all six args arrive in registers, no prologue needed.
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec   arg1
%define vec_i arg2
%define mul_array arg3
%define src   arg4
%define dest1 arg5
%define pos   return
%define pos.w return.w

; mul_array / vec_i registers are reused as destination pointers once
; the gf tables have been loaded.
%define dest2 mul_array
%define dest3 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

%define xmask0f  xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xd1    xmm8
; xd2/xd3 alias the dest1 scratch registers: safe because xd1's partial
; is consumed before xd2/xd3 are loaded (see .loop16 ordering).
%define xd2    xtmpl1
%define xd3    xtmph1

align 16
global gf_3vect_mad_sse:function
func(gf_3vect_mad_sse)
	FUNC_SAVE
	sub	len, 16			; fail if len < 16 (one full vector)
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec_i, 5		;Multiply by 32
	sal	vec, 5
	lea	tmp, [mul_array + vec_i]

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xgft3_hi, [tmp+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	; PSHUFB destroys its destination, so the constant tables are
	; copied into scratch registers each iteration.
	movdqa	xtmph1, xgft1_hi	;Reload const array registers
	movdqa	xtmpl1, xgft1_lo
	movdqa	xtmph2, xgft2_hi	;Reload const array registers
	movdqa	xtmpl2, xgft2_lo
	movdqa	xtmph3, xgft3_hi	;Reload const array registers
	movdqa	xtmpl3, xgft3_lo

	XLDR	xd1, [dest1+pos]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pxor	xd1, xtmph1

	; xtmpl1/xtmph1 are dead now; their registers become xd2/xd3.
	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pxor	xd2, xtmph2

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pxor	xd3, xtmph3

	XSTR	[dest1+pos], xd1	;Store result
	XSTR	[dest2+pos], xd2	;Store result
	XSTR	[dest3+pos], xd3	;Store result

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

.lessthan16:
	;; Tail len
	;; Do one more overlap pass
	; Last partial vector is processed at offset len (= orig_len-16),
	; overlapping already-written bytes; a compare mask suppresses the
	; duplicate XOR on those bytes.
	mov	tmp, len		;Overlapped offset length-16

	XLDR	x0, [src+tmp]		;Get next source vector
	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest vector

	sub	len, pos		; len = number of fresh tail bytes

	movdqa	xtmph3, [constip16]	;Load const of i + 16
	; Insert len at byte 15, then shuffle with 0x0f...0f (mask0f doubles
	; as an all-15 index vector) to broadcast it to every byte.
	pinsrb	xtmpl3, len.w, 15
	pshufb	xtmpl3, xmask0f		;Broadcast len to all bytes
	pcmpgtb	xtmpl3, xtmph3		; byte mask: 0xff where i < tail len

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; Tables are consumed in place here (no further iterations need them).
	; dest1
	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pand	xgft1_hi, xtmpl3	; zero partial for already-done bytes
	pxor	xd1, xgft1_hi

	; dest2
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pand	xgft2_hi, xtmpl3	; zero partial for already-done bytes
	pxor	xd2, xgft2_hi

	; dest3
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pand	xgft3_hi, xtmpl3	; zero partial for already-done bytes
	pxor	xd3, xgft3_hi

	XSTR	[dest1+tmp], xd1	;Store result
	XSTR	[dest2+tmp], xd2	;Store result
	XSTR	[dest3+tmp], xd3	;Store result

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16

mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
; Descending byte indices 15..0, compared against broadcast tail length
; to build the overlap mask.
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func        core, ver, snum
slversion gf_3vect_mad_sse, 00,  01,  0206
|
441
erasure_code/gf_4vect_dot_prod_avx.asm
Normal file
441
erasure_code/gf_4vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,441 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_4vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
;;; GF(2^8) dot product of `vec` source buffers into four destination
;;; buffers (AVX, 128-bit).  For each output byte position, each dest gets
;;; the XOR-sum over all sources of gf_mul(coef, src_byte), using per-source
;;; 32-byte nibble lookup tables from g_tbls applied with VPSHUFB.
;;; Supports elf64, win64 and elf32 (the 32-bit build keeps most "registers"
;;; in stack slots accessed via the SLDR/SSTR macros).

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 ; SLDR/SSTR are no-ops on 64-bit: all working values live in registers.
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved, loaded and restored
 %define arg5  r15		; must be saved and restored
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r14		; must be saved and restored
 %define tmp5  rdi		; must be saved and restored
 %define tmp6  rsi		; must be saved and restored
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size 9*16 + 7*8		; must be an odd multiple of 8
 %define arg(x) [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_reg	r12, 9*16 + 0*8
	save_reg	r13, 9*16 + 1*8
	save_reg	r14, 9*16 + 2*8
	save_reg	r15, 9*16 + 3*8
	save_reg	rdi, 9*16 + 4*8
	save_reg	rsi, 9*16 + 5*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	mov	r12, [rsp + 9*16 + 0*8]
	mov	r13, [rsp + 9*16 + 1*8]
	mov	r14, [rsp + 9*16 + 2*8]
	mov	r15, [rsp + 9*16 + 3*8]
	mov	rdi, [rsp + 9*16 + 4*8]
	mov	rsi, [rsp + 9*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	var2
;;;	var3
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 ; Only a handful of x86-32 registers exist, so several logical values
 ; share ecx/esi and are swapped through stack slots via SLDR/SSTR.
 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m	 arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m	 arg(2)
 %define arg3	 trans
 %define arg3_m	 arg(3)
 %define arg4	 trans
 %define arg4_m	 arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp3_m	 var(0)
 %define tmp4	 trans2
 %define tmp4_m	 var(1)
 %define tmp5	 trans2
 %define tmp5_m	 var(2)
 %define tmp6	 trans2
 %define tmp6_m	 var(3)
 %define return	 eax
 %macro SLDR 2			;stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*4	;4 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*4	;4 local variables
	pop	ebp
 %endmacro

%endif	; output formats

%define len    arg0
%define vec    arg1
%define mul_array arg2
%define src    arg3
%define dest1  arg4
%define ptr    arg5
%define vec_i  tmp2
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6
%define pos    return

%ifidn PS,4				;32-bit code
 %define len_m    arg0_m
 %define src_m    arg3_m
 %define dest1_m  arg4_m
 %define dest2_m  tmp3_m
 %define dest3_m  tmp4_m
 %define dest4_m  tmp5_m
 %define vskip3_m tmp6_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%ifidn PS,8			; 64-bit code
 default rel
 [bits 64]
%endif


section .text

%ifidn PS,8			;64-bit code
 %define xmask0f  xmm14
 %define xgft1_lo xmm13
 %define xgft1_hi xmm12
 %define xgft2_lo xmm11
 %define xgft2_hi xmm10
 %define xgft3_lo xmm9
 %define xgft3_hi xmm8
 %define xgft4_lo xmm7
 %define xgft4_hi xmm6

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
 %define xp3   xmm4
 %define xp4   xmm5
%else
 ; 32-bit: too few xmm regs to hold all tables at once; mask and table
 ; registers are shared and reloaded per use.
 %define xmm_trans xmm7		;reuse xmask0f and xgft1_lo
 %define xmask0f  xmm_trans
 %define xgft1_lo xmm_trans
 %define xgft1_hi xmm6
 %define xgft2_lo xgft1_lo
 %define xgft2_hi xgft1_hi
 %define xgft3_lo xgft1_lo
 %define xgft3_hi xgft1_hi
 %define xgft4_lo xgft1_lo
 %define xgft4_hi xgft1_hi

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
 %define xp3   xmm4
 %define xp4   xmm5
%endif
align 16
global gf_4vect_dot_prod_avx:function
func(gf_4vect_dot_prod_avx)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; fail if len < 16 (one full vector)
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip3, vec
	imul	vskip3, 96		; byte offset of the 4th table group (3*32*vec)
	SSTR	vskip3_m, vskip3
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest4, [dest1+3*PS]
	SSTR	dest4_m, dest4
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	vpxor	xp1, xp1		; clear the four accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]

%ifidn PS,8				;64-bit code
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	vmovdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}

	XLDR	x0, [ptr+pos]		;Get next source vector
	add	tmp, 32
	add	vec_i, PS

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
%else					;32-bit code
	XLDR	x0, [ptr+pos]		;Get next source vector
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
%endif

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				;32-bit code
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
%endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				;32-bit code
	; temporarily double vec so vec*(32/PS) addresses the 3rd table
	; group (64*vec_count bytes in), then restore it
	sal	vec, 1
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1
%endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

%ifidn PS,4				;32-bit code
	SLDR	vskip3, vskip3_m
	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	vmovdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	vpshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	vpxor	xp4, xgft4_hi		;xp4 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	SLDR	dest4, dest4_m
	XSTR	[dest4+pos], xp4

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	; Redo the full dot product for the last (overlapping) 16 bytes;
	; recomputation is safe because the output is a pure function of
	; the sources at each offset.
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func        core, ver, snum
slversion gf_4vect_dot_prod_avx, 02,  05,  0193
|
460
erasure_code/gf_4vect_dot_prod_avx2.asm
Normal file
460
erasure_code/gf_4vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,460 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
;;; Platform/ABI selection: map portable names (arg0..arg5, tmp*, return) onto
;;; the registers of the active output format.  SLDR/SSTR are stack load/store
;;; helpers that are no-ops on 64-bit (enough registers) and real moves on
;;; 32-bit, where several logical variables share registers via stack slots.
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit: vars live in registers
 %endmacro
 %define SSTR SLDR
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12		; must be saved, loaded and restored
 %define arg5   r15		; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size 9*16 + 7*8	; must be an odd multiple of 8 (keeps rsp 16B-aligned)
 %define arg(x)     [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	; Win64 ABI: xmm6-xmm15 and the listed GPRs are callee-saved.
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	save_reg	r12,  9*16 + 0*8
	save_reg	r13,  9*16 + 1*8
	save_reg	r14,  9*16 + 2*8
	save_reg	r15,  9*16 + 3*8
	save_reg	rdi,  9*16 + 4*8
	save_reg	rsi,  9*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th argument is passed on the stack on win64
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6,  [rsp + 0*16]
	vmovdqa	xmm7,  [rsp + 1*16]
	vmovdqa	xmm8,  [rsp + 2*16]
	vmovdqa	xmm9,  [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	mov	r12, [rsp + 9*16 + 0*8]
	mov	r13, [rsp + 9*16 + 1*8]
	mov	r14, [rsp + 9*16 + 2*8]
	mov	r15, [rsp + 9*16 + 3*8]
	mov	rdi, [rsp + 9*16 + 4*8]
	mov	rsi, [rsp + 9*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	var2
;;;	var3
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m  arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m  arg(2)
 %define arg3	 trans
 %define arg3_m  arg(3)
 %define arg4	 trans
 %define arg4_m  arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp.w	 edx
 %define tmp.b	 dl
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp3_m  var(0)
 %define tmp4	 trans2
 %define tmp4_m  var(1)
 %define tmp5	 trans2
 %define tmp5_m  var(2)
 %define tmp6	 trans2
 %define tmp6_m  var(3)
 %define return  eax
 %macro SLDR 2			;stack load/restore: real moves on 32-bit
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*4	;4 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*4	;4 local variables
	pop	ebp
 %endmacro

%endif	; output formats

;;; Logical variable names for the dot-product kernel.
%define len    arg0		; byte length of each buffer
%define vec    arg1		; number of source vectors
%define mul_array arg2		; GF multiply lookup tables (g_tbls)
%define src    arg3		; array of source buffer pointers
%define dest1  arg4		; array of 4 destination buffer pointers
%define ptr    arg5		; current source pointer
%define vec_i  tmp2		; byte index into src pointer array
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6		; byte offset of the 4th table set (vec*96)
%define pos    return		; current byte offset within buffers

%ifidn PS,4			;32-bit code: stack homes for spilled vars
 %define len_m	  arg0_m
 %define src_m	  arg3_m
 %define dest1_m  arg4_m
 %define dest2_m  tmp3_m
 %define dest3_m  tmp4_m
 %define dest4_m  tmp5_m
 %define vskip3_m tmp6_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%ifidn PS,8			;64-bit code
 default rel
 [bits 64]
%endif


section .text

;; Register allocation.  64-bit has enough ymm registers to keep all four
;; table pairs live; 32-bit must reload tables per destination, so the
;; xgftN_* names all alias the same two registers.
%ifidn PS,8			;64-bit code
 %define xmask0f   ymm14
 %define xmask0fx  xmm14
 %define xgft1_lo  ymm13
 %define xgft1_hi  ymm12
 %define xgft2_lo  ymm11
 %define xgft2_hi  ymm10
 %define xgft3_lo  ymm9
 %define xgft3_hi  ymm8
 %define xgft4_lo  ymm7
 %define xgft4_hi  ymm6

 %define x0	ymm0
 %define xtmpa	ymm1
 %define xp1	ymm2
 %define xp2	ymm3
 %define xp3	ymm4
 %define xp4	ymm5
%else
 %define ymm_trans ymm7		;reuse xmask0f and xgft1_hi
 %define xmask0f   ymm_trans
 %define xmask0fx  xmm7
 %define xgft1_lo  ymm6
 %define xgft1_hi  ymm_trans
 %define xgft2_lo  xgft1_lo
 %define xgft2_hi  xgft1_hi
 %define xgft3_lo  xgft1_lo
 %define xgft3_hi  xgft1_hi
 %define xgft4_lo  xgft1_lo
 %define xgft4_hi  xgft1_hi

 %define x0	ymm0
 %define xtmpa	ymm1
 %define xp1	ymm2
 %define xp2	ymm3
 %define xp3	ymm4
 %define xp4	ymm5
%endif

;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
;;; Accumulates, for each of 4 destination buffers, the XOR ("GF add") of
;;; per-source partial products computed by vpshufb nibble-table lookups.
;;; Returns 0 on success, 1 if len < 32 (too short for one 32B pass).
align 16
global gf_4vect_dot_prod_avx2:function
func(gf_4vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32			; bias len so loop bound is len-32
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
	mov	vskip3, vec
	imul	vskip3, 96		; 3 table sets of 32 bytes each per source
	SSTR	vskip3_m, vskip3
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	; dest1 initially points at the dests array
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest4, [dest1+3*PS]
	SSTR	dest4_m, dest4
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop32:
	vpxor	xp1, xp1		; clear the 4 accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]
	XLDR	x0, [ptr+pos]		;Get next source vector

	add	vec_i, PS
 %ifidn PS,8				;64-bit code
	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo

	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
						; " Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
						; " Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
						; " Cx{00}, Cx{10}, ..., Cx{f0}
	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
						; " Dx{00}, Dx{10}, ..., Dx{f0}

	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
	add	tmp, 32
 %else					;32-bit code
	; mask register was clobbered (aliases table regs); rebuild it each pass
	mov	cl, 0x0f		;use ecx as a temp variable
	vpinsrb	xmask0fx, xmask0fx, ecx, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo

	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
						; " Ax{00}, Ax{10}, ..., Ax{f0}
	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
 %endif

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

 %ifidn PS,4				; 32-bit code
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
						; " Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
 %endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

 %ifidn PS,4				; 32-bit code
	sal	vec, 1			; temporarily double vec so 32/PS scaling hits the C tables
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
						; " Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	sar	vec, 1
 %endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

 %ifidn PS,4				; 32-bit code
	SLDR	vskip3, vskip3_m
	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
					; " DX{00}, Dx{10}, ..., Dx{f0}
	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
	add	tmp, 32
 %endif
	vpshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	vpxor	xp4, xgft4_hi		;xp4 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	SLDR	dest4, dest4_m
	XSTR	[dest4+pos], xp4

	SLDR	len, len_m
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		; len was biased by -32 above
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-32
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func            core, ver, snum
slversion gf_4vect_dot_prod_avx2, 04,  05,  0198
|
443
erasure_code/gf_4vect_dot_prod_sse.asm
Normal file
443
erasure_code/gf_4vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,443 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
;;; Platform/ABI selection: map portable names (arg0..arg5, tmp*, return) onto
;;; the registers of the active output format.  SLDR/SSTR are stack load/store
;;; helpers that are no-ops on 64-bit and real moves on 32-bit, where several
;;; logical variables share registers via stack slots.
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit: vars live in registers
 %endmacro
 %define SSTR SLDR
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12		; must be saved, loaded and restored
 %define arg5   r15		; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size 9*16 + 7*8	; must be an odd multiple of 8 (keeps rsp 16B-aligned)
 %define arg(x)     [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	; Win64 ABI: xmm6-xmm15 and the listed GPRs are callee-saved.
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_reg	r12,  9*16 + 0*8
	save_reg	r13,  9*16 + 1*8
	save_reg	r14,  9*16 + 2*8
	save_reg	r15,  9*16 + 3*8
	save_reg	rdi,  9*16 + 4*8
	save_reg	rsi,  9*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th argument is passed on the stack on win64
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6,  [rsp + 0*16]
	movdqa	xmm7,  [rsp + 1*16]
	movdqa	xmm8,  [rsp + 2*16]
	movdqa	xmm9,  [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	mov	r12, [rsp + 9*16 + 0*8]
	mov	r13, [rsp + 9*16 + 1*8]
	mov	r14, [rsp + 9*16 + 2*8]
	mov	r15, [rsp + 9*16 + 3*8]
	mov	rdi, [rsp + 9*16 + 4*8]
	mov	rsi, [rsp + 9*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	var2
;;;	var3
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m  arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m  arg(2)
 %define arg3	 trans
 %define arg3_m  arg(3)
 %define arg4	 trans
 %define arg4_m  arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp3_m  var(0)
 %define tmp4	 trans2
 %define tmp4_m  var(1)
 %define tmp5	 trans2
 %define tmp5_m  var(2)
 %define tmp6	 trans2
 %define tmp6_m  var(3)
 %define return  eax
 %macro SLDR 2			;stack load/restore: real moves on 32-bit
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*4	;4 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*4	;4 local variables
	pop	ebp
 %endmacro

%endif	; output formats

;;; Logical variable names for the dot-product kernel.
%define len    arg0		; byte length of each buffer
%define vec    arg1		; number of source vectors
%define mul_array arg2		; GF multiply lookup tables (g_tbls)
%define src    arg3		; array of source buffer pointers
%define dest1  arg4		; array of 4 destination buffer pointers
%define ptr    arg5		; current source pointer
%define vec_i  tmp2		; byte index into src pointer array
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6		; byte offset of the 4th table set (vec*96)
%define pos    return		; current byte offset within buffers

%ifidn PS,4			;32-bit code: stack homes for spilled vars
 %define len_m	  arg0_m
 %define src_m	  arg3_m
 %define dest1_m  arg4_m
 %define dest2_m  tmp3_m
 %define dest3_m  tmp4_m
 %define dest4_m  tmp5_m
 %define vskip3_m tmp6_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

%ifidn PS,8			; 64-bit code
 default rel
 [bits 64]
%endif


section .text

;; Register allocation.  64-bit keeps all four table pairs live; 32-bit must
;; reload tables per destination, so the xgftN_* names alias two registers.
%ifidn PS,8			;64-bit code
 %define xmask0f   xmm14
 %define xgft1_lo  xmm2
 %define xgft1_hi  xmm3
 %define xgft2_lo  xmm11
 %define xgft2_hi  xmm4
 %define xgft3_lo  xmm9
 %define xgft3_hi  xmm5
 %define xgft4_lo  xmm7
 %define xgft4_hi  xmm6

 %define x0	xmm0
 %define xtmpa	xmm1
 %define xp1	xmm8
 %define xp2	xmm10
 %define xp3	xmm12
 %define xp4	xmm13
%else
 %define xmm_trans xmm7		;reuse xmask0f and xgft1_lo
 %define xmask0f   xmm_trans
 %define xgft1_lo  xmm_trans
 %define xgft1_hi  xmm6
 %define xgft2_lo  xgft1_lo
 %define xgft2_hi  xgft1_hi
 %define xgft3_lo  xgft1_lo
 %define xgft3_hi  xgft1_hi
 %define xgft4_lo  xgft1_lo
 %define xgft4_hi  xgft1_hi

 %define x0	xmm0
 %define xtmpa	xmm1
 %define xp1	xmm2
 %define xp2	xmm3
 %define xp3	xmm4
 %define xp4	xmm5
%endif

;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests)
;;; Accumulates, for each of 4 destination buffers, the XOR ("GF add") of
;;; per-source partial products computed by pshufb nibble-table lookups.
;;; Returns 0 on success, 1 if len < 16 (too short for one 16B pass).
align 16
global gf_4vect_dot_prod_sse:function
func(gf_4vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; bias len so loop bound is len-16
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip3, vec
	imul	vskip3, 96		; 3 table sets of 32 bytes each per source
	SSTR	vskip3_m, vskip3
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	; dest1 initially points at the dests array
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest4, [dest1+3*PS]
	SSTR	dest4_m, dest4
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	pxor	xp1, xp1		; clear the 4 accumulators
	pxor	xp2, xp2
	pxor	xp3, xp3
	pxor	xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]

 %ifidn PS,8				;64-bit code
	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
	movdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}

	XLDR	x0, [ptr+pos]		;Get next source vector
	add	tmp, 32
	add	vec_i, PS

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
 %else					;32-bit code
	XLDR	x0, [ptr+pos]		;Get next source vector
	; mask register was clobbered (aliases table regs); reload it each pass
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
 %endif

	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

 %ifidn PS,4				;32-bit code
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
 %endif
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

 %ifidn PS,4				;32-bit code
	sal	vec, 1			; temporarily double vec so 32/PS scaling hits the C tables
	movdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1
 %endif
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp3, xgft3_hi		;xp3 += partial

 %ifidn PS,4				;32-bit code
	SLDR	vskip3, vskip3_m
	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}
	add	tmp, 32
	add	vec_i, PS
 %endif
	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	pxor	xp4, xgft4_hi		;xp4 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	SLDR	dest4, dest4_m
	XSTR	[dest4+pos], xp4

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		; len was biased by -16 above
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func           core, ver, snum
slversion gf_4vect_dot_prod_sse, 00,  06,  0064
|
281
erasure_code/gf_4vect_dot_prod_sse_perf.c
Normal file
281
erasure_code/gf_4vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,281 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
/* Name of the routine being benchmarked; overridable at compile time so the
 * same harness can drive other gf_4vect_dot_prod_* variants. */
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse
#endif

/* Stringize helpers: xstr() expands the macro before quoting it. */
#define str(s) #s
#define xstr(s) str(s)

/* Benchmark sizing.  Enable CACHED_TEST for a cache-resident workload,
 * define TEST_CUSTOM (with TEST_SOURCES/TEST_LEN) for a custom one; the
 * default is an uncached test pulled from a buffer larger than L3. */
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN     8*1024
# define TEST_LOOPS   40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test.  Pull from large mem base.
#  define TEST_SOURCES 10
#  define GT_L3_CACHE  32*1024*1024	/* some number > last level cache */
#  define TEST_LEN     ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))	/* 64B-aligned slice */
#  define TEST_LOOPS   100
#  define TEST_TYPE_STR "_cold"
# else
#  define TEST_TYPE_STR "_cus"
#  ifndef TEST_LOOPS
#   define TEST_LOOPS  1000
#  endif
# endif
#endif

/* Shorthand used throughout the erasure-code tests. */
typedef unsigned char u8;
|
||||
|
||||
/* Print len bytes of buf as space-separated hex, 32 bytes per row,
 * followed by a trailing newline. */
void dump(unsigned char *buf, int len)
{
	int idx;

	for (idx = 0; idx < len; idx++) {
		printf(" %2x", buf[idx] & 0xff);
		if ((idx + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k-row matrix of byte pointers, m hex bytes per row, one row per
 * line, followed by a trailing blank line. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
u8 *dest_ref4, *dest_ptrs[4];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 100; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_4vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 4) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 4) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
692
erasure_code/gf_4vect_dot_prod_sse_test.c
Normal file
692
erasure_code/gf_4vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,692 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 10000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf as two-digit hex values, 32 bytes per output row.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", 0xff & buf[idx]);
		if (++idx % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix of bytes in hex, one matrix row per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat k*m byte array as a k x m hex matrix (row-major layout).
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
u8 *dest_ref4, *dest_ptrs[4];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[4];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
memset(g4, 3, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
|
||||
dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test4 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref4, dest4, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
printf("size=%d offset=%d srcs=%d\n", size, offset, srcs);
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[3] - dest4;
|
||||
if (memcmp(dest4, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad4 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad4 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
336
erasure_code/gf_4vect_mad_avx.asm
Normal file
336
erasure_code/gf_4vect_mad_avx.asm
Normal file
@ -0,0 +1,336 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
;;; Multiply-and-add (MAD) of one source buffer into FOUR destination
;;; buffers over GF(2^8), 16 bytes per iteration, using the AVX 4-bit
;;; split-table pshufb technique: each source byte is split into its low
;;; and high nibbles, each nibble indexes a 16-byte lookup table, and the
;;; two partial products are XORed together, then XORed into the dest.
;;;
;;;   len       - byte count to process; returns 1 (fail) if len < 16
;;;   vec       - total number of source vectors (32-byte table stride)
;;;   vec_i     - index of this source vector within mul_array
;;;   mul_array - gf tables, 32 bytes (low tbl + high tbl) per coefficient
;;;   src       - source data pointer
;;;   dest      - array of 4 destination pointers
;;;
;;; Returns 0 in rax on success, 1 if len < 16.

%include "reg_sizes.asm"

%define PS 8				; pointer size; used to index the dest[] array

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; args 4/5 come from the stack on win64
 %define arg5   r15
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-15 + saved r12/r13/r15
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save all callee-saved registers this routine clobbers (win64 ABI:
 ; xmm6-xmm15 plus r12/r13/r15), then fetch stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r15, 10*16 + 2*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r15, [rsp + 10*16 + 2*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12			; r12 is callee-saved on SysV; pushed below
 %define return rax
 %define return.w eax

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r12
 %endmacro
%endif

;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

; mul_array and vec_i are fully consumed during setup, so their registers
; are recycled to hold destination pointers 2-4.
%define dest2 mul_array
%define dest3 tmp2
%define dest4 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

default rel

[bits 64]
section .text

; Registers held live across the whole loop.
%define xmask0f  xmm15
%define xgft3_hi xmm14
%define xgft4_hi xmm13
%define xgft4_lo xmm12

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph4 xmm8
%define xtmpl4 xmm9
%define xd1    xmm10
%define xd2    xmm11
; xd3/xd4 alias xtmph1/xtmpl1: those temporaries are dead once the dest1
; partial product has been folded in, so their registers are reused.
%define xd3 xtmph1
%define xd4 xtmpl1

align 16
global gf_4vect_mad_avx:function
func(gf_4vect_mad_avx)
	FUNC_SAVE
	sub	len, 16			; main loop handles whole 16B chunks
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	mov	tmp, vec

	sal	vec_i, 5		;Multiply by 32
	lea	tmp3, [mul_array + vec_i]	;tmp3 -> table column for this src

	sal	tmp, 6			;Multiply by 64
	vmovdqu	xgft3_hi, [tmp3+tmp+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	sal	vec, 5			;Multiply by 32
	add	tmp, vec		;tmp = vec*96: offset of the D tables
	vmovdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
	vmovdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}

	; Spread the 4 destination pointers into recycled registers;
	; dest1 is read last since arg5 itself is overwritten.
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]
	mov	dest4, [dest1+3*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	; A/B high+low and C low tables are reloaded each pass because there
	; are not enough xmm registers to keep all 8 tables resident.
	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl1, xtmpl1, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl1	;GF add high and low partials
	vpxor	xd1, xd1, xtmph1

	; Safe only now: xtmph1/xtmpl1 are dead after the dest1 update above.
	XLDR	xd3, [dest3+pos]	;Reuse xtmph1, Get next dest vector
	XLDR	xd4, [dest4+pos]	;Reuse xtmpl1, Get next dest vector

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl2, xtmpl2, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
	vpxor	xd2, xd2, xtmph2

	; dest3
	vpshufb	xtmph3, xgft3_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl3, xtmpl3, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xtmpl3	;GF add high and low partials
	vpxor	xd3, xd3, xtmph3

	; dest4
	vpshufb	xtmph4, xgft4_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl4, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph4, xtmph4, xtmpl4	;GF add high and low partials
	vpxor	xd4, xd4, xtmph4

	XSTR	[dest1+pos], xd1	;Store result
	XSTR	[dest2+pos], xd2	;Store result
	XSTR	[dest3+pos], xd3	;Store result
	XSTR	[dest4+pos], xd4	;Store result

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original length
	cmp	pos, tmp
	je	.return_pass		;len was a multiple of 16 - done

.lessthan16:
	;; Tail len
	;; Do one more overlap pass: reprocess the final 16 bytes at offset
	;; len-16 and mask the partial product so bytes already written by
	;; the main loop XOR with zero and are left unchanged.
	mov	tmp, len		;Overlapped offset length-16

	XLDR	x0, [src+tmp]		;Get next source vector

	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector
	XLDR	xtmph4, [dest3+tmp]	;Reuse xtmph4. Get next dest vector

	sub	len, pos		;len = -(bytes already done past offset)

	; Build the tail byte-mask in xtmph3: broadcast len (byte 15 index
	; 0x0f in xmask0f selects it into every lane) and compare against
	; descending constants so only not-yet-written bytes stay enabled.
	vmovdqa	xtmpl4, [constip16]	;Load const of i + 16
	vpinsrb	xtmph3, xtmph3, len.w, 15
	vpshufb	xtmph3, xtmph3, xmask0f	;Broadcast len to all bytes
	vpcmpgtb	xtmph3, xtmph3, xtmpl4

	XLDR	xtmpl4, [dest4+tmp]	;Get next dest vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl1, xtmpl1, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl1	;GF add high and low partials
	vpand	xtmph1, xtmph1, xtmph3	;Zero partials for already-done bytes
	vpxor	xd1, xd1, xtmph1

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl2, xtmpl2, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
	vpand	xtmph2, xtmph2, xtmph3	;Zero partials for already-done bytes
	vpxor	xd2, xd2, xtmph2

	; dest3 - the resident table registers can be clobbered now
	vpshufb	xgft3_hi, xgft3_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl3, xtmpl3, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_hi, xtmpl3	;GF add high and low partials
	vpand	xgft3_hi, xgft3_hi, xtmph3	;Zero partials for already-done bytes
	vpxor	xtmph4, xtmph4, xgft3_hi

	; dest4
	vpshufb	xgft4_hi, xgft4_hi, x0	;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft4_hi, xgft4_hi, xgft4_lo	;GF add high and low partials
	vpand	xgft4_hi, xgft4_hi, xtmph3	;Zero partials for already-done bytes
	vpxor	xtmpl4, xtmpl4, xgft4_hi

	XSTR	[dest1+tmp], xd1	;Store result
	XSTR	[dest2+tmp], xd2	;Store result
	XSTR	[dest3+tmp], xtmph4	;Store result
	XSTR	[dest4+tmp], xtmpl4	;Store result

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
; 0x0f in every byte: nibble mask, also used as a broadcast-byte-15 shuffle.
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
; Byte i holds (i - 16) as a signed byte; compared against the broadcast
; negative tail length to build the overlap-pass write mask.
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func          core, ver, snum
slversion gf_4vect_mad_avx, 02, 01, 020a
|
342
erasure_code/gf_4vect_mad_avx2.asm
Normal file
342
erasure_code/gf_4vect_mad_avx2.asm
Normal file
@ -0,0 +1,342 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
;;; AVX2 variant of the 4-destination GF(2^8) multiply-and-add:
;;; 32 bytes per iteration. Each ymm table register holds the complete
;;; 32-byte table (low-nibble table in the low lane, high-nibble table
;;; in the high lane) for one coefficient, so all 4 tables stay resident
;;; and vperm2i128 is used per-iteration to line the lanes up with the
;;; split source nibbles.
;;;
;;;   len       - byte count to process; returns 1 (fail) if len < 32
;;;   vec       - total number of source vectors (32-byte table stride)
;;;   vec_i     - index of this source vector within mul_array
;;;   mul_array - gf tables, 32 bytes per coefficient
;;;   src       - source data pointer
;;;   dest      - array of 4 destination pointers
;;;
;;; Returns 0 in rax on success, 1 if len < 32.

%include "reg_sizes.asm"

%define PS 8				; pointer size; used to index the dest[] array

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; args 4/5 come from the stack on win64
 %define arg5   r15
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-15 + saved r12/r15
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save callee-saved registers clobbered here (win64 ABI: xmm6-xmm15,
 ; r12, r15), then fetch stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r15, 10*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r15, [rsp + 10*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax

 ; SysV: no callee-saved registers are touched, so save/restore are no-ops.
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

; mul_array, vec and vec_i are fully consumed during setup, so their
; registers are recycled to hold destination pointers 2-4.
%define dest2 mul_array
%define dest3 vec
%define dest4 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

default rel

[bits 64]
section .text

%define xmask0f  ymm15
%define xmask0fx xmm15			; xmm view of the same register
%define xgft1_lo ymm14			; full 32B table (lo|hi) per coefficient
%define xgft2_lo ymm13
%define xgft3_lo ymm12
%define xgft4_lo ymm11

%define x0      ymm0
%define xtmpa   ymm1
%define xtmpl   ymm2
%define xtmplx  xmm2
%define xtmph1  ymm3
%define xtmph1x xmm3
%define xtmph2  ymm4
%define xtmph3  ymm5
%define xtmph4  ymm6
%define xd1     ymm7
%define xd2     ymm8
%define xd3     ymm9
%define xd4     ymm10

align 16
global gf_4vect_mad_avx2:function
func(gf_4vect_mad_avx2)
	FUNC_SAVE
	sub	len, 32			; main loop handles whole 32B chunks
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	sal	vec_i, 5		;Multiply by 32
	sal	vec, 5			;Multiply by 32
	lea	tmp, [mul_array + vec_i]	;tmp -> table column for this src

	; Load all four 32-byte tables once; they stay resident in ymm regs.
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
					; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
					; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	add	tmp, vec
	vmovdqu	xgft4_lo, [tmp+2*vec]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
					; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}

	; Spread the 4 destination pointers into recycled registers;
	; dest1 is read last since arg5 itself is overwritten.
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]	; reuse vec
	mov	dest4, [dest1+3*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop32:
	XLDR	x0, [src+pos]		;Get next source vector

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector
	XLDR	xd3, [dest3+pos]	;Get next dest vector
	XLDR	xd4, [dest4+pos]	;reuse xtmpl1. Get next dest vector

	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; Re-pack lanes so each source half sits opposite its table half.
	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xtmpl, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo

	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo

	; dest1
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
	vpxor	xd1, xd1, xtmph1	;xd1 += partial

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
	vpxor	xd2, xd2, xtmph2	;xd2 += partial

	; dest3
	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xtmpl	;GF add high and low partials
	vpxor	xd3, xd3, xtmph3	;xd3 += partial

	; dest4
	vpshufb	xtmph4, xtmph4, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph4, xtmph4, xtmpl	;GF add high and low partials
	vpxor	xd4, xd4, xtmph4	;xd4 += partial

	XSTR	[dest1+pos], xd1
	XSTR	[dest2+pos], xd2
	XSTR	[dest3+pos], xd3
	XSTR	[dest4+pos], xd4

	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		;tmp = original length
	cmp	pos, tmp
	je	.return_pass		;len was a multiple of 32 - done

.lessthan32:
	;; Tail len
	;; Do one more overlap pass: reprocess the final 32 bytes at offset
	;; len-32, masking the partial products so bytes already written by
	;; the main loop XOR with zero and are left unchanged.
	mov	tmp.b, 0x1f
	vpinsrb	xtmph1x, xtmph1x, tmp.w, 0
	vpbroadcastb xtmph1, xtmph1x	;Construct mask 0x1f1f1f...

	mov	tmp, len		;Overlapped offset length-32

	XLDR	x0, [src+tmp]		;Get next source vector

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector
	XLDR	xd3, [dest3+tmp]	;Get next dest vector
	XLDR	xd4, [dest4+tmp]	;Get next dest vector

	sub	len, pos		;len = -(bytes already done past offset)

	; Build the tail byte-mask in xtmpl: broadcast len across all 32
	; bytes (index 0x1f selects byte 15 in both lanes after the
	; vinserti128 duplication), then compare against descending
	; constants so only not-yet-written bytes stay enabled.
	vmovdqa	xtmph2, [constip32]	;Load const of i + 32
	vpinsrb	xtmplx, xtmplx, len.w, 15
	vinserti128 xtmpl, xtmpl, xtmplx, 1	;swapped to xtmplx | xtmplx
	vpshufb	xtmpl, xtmpl, xtmph1	;Broadcast len to all bytes. xtmph1=0x1f1f1f...
	vpcmpgtb	xtmpl, xtmpl, xtmph2

	vpand	xtmph1, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vperm2i128 xtmpa, xtmph1, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xtmph1, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo

	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo

	; dest1 - table registers can be clobbered now; last use
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xgft1_lo	;GF add high and low partials
	vpand	xtmph1, xtmph1, xtmpl	;Zero partials for already-done bytes
	vpxor	xd1, xd1, xtmph1	;xd1 += partial

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xgft2_lo	;GF add high and low partials
	vpand	xtmph2, xtmph2, xtmpl	;Zero partials for already-done bytes
	vpxor	xd2, xd2, xtmph2	;xd2 += partial

	; dest3
	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xgft3_lo	;GF add high and low partials
	vpand	xtmph3, xtmph3, xtmpl	;Zero partials for already-done bytes
	vpxor	xd3, xd3, xtmph3	;xd3 += partial

	; dest4
	vpshufb	xtmph4, xtmph4, x0	;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph4, xtmph4, xgft4_lo	;GF add high and low partials
	vpand	xtmph4, xtmph4, xtmpl	;Zero partials for already-done bytes
	vpxor	xd4, xd4, xtmph4	;xd4 += partial

	XSTR	[dest1+tmp], xd1
	XSTR	[dest2+tmp], xd2
	XSTR	[dest3+tmp], xd3
	XSTR	[dest4+tmp], xd4

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data
align 32
; Byte i holds (i - 32) as a signed byte; compared against the broadcast
; negative tail length to build the overlap-pass write mask.
constip32:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
	ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef

;;; func           core, ver, snum
slversion gf_4vect_mad_avx2, 04, 01, 020b
|
342
erasure_code/gf_4vect_mad_sse.asm
Normal file
342
erasure_code/gf_4vect_mad_sse.asm
Normal file
@ -0,0 +1,342 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
;;; SSE (SSSE3 pshufb) variant of the 4-destination GF(2^8)
;;; multiply-and-add: 16 bytes per iteration. Uses two-operand
;;; (destructive) instruction forms, so tables kept in registers are
;;; copied to scratch registers before each pshufb and an unshifted
;;; copy of the source is kept for the low-nibble lookup.
;;;
;;;   len       - byte count to process; returns 1 (fail) if len < 16
;;;   vec       - total number of source vectors (32-byte table stride)
;;;   vec_i     - index of this source vector within mul_array
;;;   mul_array - gf tables, 32 bytes (low tbl + high tbl) per coefficient
;;;   src       - source data pointer
;;;   dest      - array of 4 destination pointers
;;;
;;; Returns 0 in rax on success, 1 if len < 16.

%include "reg_sizes.asm"

%define PS 8				; pointer size; used to index the dest[] array

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; args 4/5 come from the stack on win64
 %define arg5   r15
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-15 + saved r12/r13/r15
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save callee-saved registers clobbered here (win64 ABI: xmm6-xmm15,
 ; r12, r13, r15), then fetch stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r15, 10*16 + 2*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r15, [rsp + 10*16 + 2*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12			; r12 is callee-saved on SysV; pushed below
 %define return rax
 %define return.w eax

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r12
 %endmacro
%endif

;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

; mul_array and vec_i are fully consumed during setup, so their registers
; are recycled to hold destination pointers 2-4.
%define dest2 mul_array
%define dest3 tmp2
%define dest4 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

; Registers held live across the whole loop.
%define xmask0f  xmm15
%define xgft3_hi xmm14
%define xgft4_hi xmm13
%define xgft4_lo xmm12

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph4 xmm8
%define xtmpl4 xmm9
%define xd1    xmm10
%define xd2    xmm11
; xd3/xd4 alias xtmph1/xtmpl1: those temporaries are dead once the dest1
; partial product has been folded in, so their registers are reused.
%define xd3 xtmph1
%define xd4 xtmpl1

align 16
global gf_4vect_mad_sse:function
func(gf_4vect_mad_sse)
	FUNC_SAVE
	sub	len, 16			; main loop handles whole 16B chunks
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	tmp, vec

	sal	vec_i, 5		;Multiply by 32
	lea	tmp3, [mul_array + vec_i]	;tmp3 -> table column for this src

	sal	tmp, 6			;Multiply by 64

	movdqu	xgft3_hi, [tmp3+tmp+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	sal	vec, 5			;Multiply by 32
	add	tmp, vec		;tmp = vec*96: offset of the D tables
	movdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
	movdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}

	; Spread the 4 destination pointers into recycled registers;
	; dest1 is read last since arg5 itself is overwritten.
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]
	mov	dest4, [dest1+3*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	; A/B high+low and C low tables are reloaded each pass because there
	; are not enough xmm registers to keep all 8 tables resident.
	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	; pshufb is destructive, so copy the resident tables to scratch.
	movdqa	xtmph3, xgft3_hi
	movdqa	xtmpl4, xgft4_lo
	movdqa	xtmph4, xgft4_hi

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pxor	xd1, xtmph1

	; Safe only now: xtmph1/xtmpl1 are dead after the dest1 update above.
	XLDR	xd3, [dest3+pos]	;Reuse xtmph1, Get next dest vector
	XLDR	xd4, [dest4+pos]	;Reuse xtmpl1, Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pxor	xd2, xtmph2

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pxor	xd3, xtmph3

	; dest4
	pshufb	xtmph4, x0		;Lookup mul table of high nibble
	pshufb	xtmpl4, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph4, xtmpl4		;GF add high and low partials
	pxor	xd4, xtmph4

	XSTR	[dest1+pos], xd1	;Store result
	XSTR	[dest2+pos], xd2	;Store result
	XSTR	[dest3+pos], xd3	;Store result
	XSTR	[dest4+pos], xd4	;Store result

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original length
	cmp	pos, tmp
	je	.return_pass		;len was a multiple of 16 - done

.lessthan16:
	;; Tail len
	;; Do one more overlap pass: reprocess the final 16 bytes at offset
	;; len-16 and mask the partial product so bytes already written by
	;; the main loop XOR with zero and are left unchanged.
	mov	tmp, len		;Overlapped offset length-16

	XLDR	x0, [src+tmp]		;Get next source vector

	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector
	XLDR	xtmph4, [dest3+tmp]	;Reuse xtmph1. Get next dest vector

	sub	len, pos		;len = -(bytes already done past offset)

	; Build the tail byte-mask in xtmph3: broadcast len (index 0x0f in
	; xmask0f selects byte 15 into every lane) and compare against
	; descending constants so only not-yet-written bytes stay enabled.
	movdqa	xtmpl4, [constip16]	;Load const of i + 16
	pinsrb	xtmph3, len.w, 15
	pshufb	xtmph3, xmask0f		;Broadcast len to all bytes
	pcmpgtb	xtmph3, xtmpl4

	XLDR	xtmpl4, [dest4+tmp]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pand	xtmph1, xtmph3		;Zero partials for already-done bytes
	pxor	xd1, xtmph1

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pand	xtmph2, xtmph3		;Zero partials for already-done bytes
	pxor	xd2, xtmph2

	; dest3 - the resident table registers can be clobbered now
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xtmpl3	;GF add high and low partials
	pand	xgft3_hi, xtmph3	;Zero partials for already-done bytes
	pxor	xtmph4, xgft3_hi

	; dest4
	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	pand	xgft4_hi, xtmph3	;Zero partials for already-done bytes
	pxor	xtmpl4, xgft4_hi

	XSTR	[dest1+tmp], xd1	;Store result
	XSTR	[dest2+tmp], xd2	;Store result
	XSTR	[dest3+tmp], xtmph4	;Store result
	XSTR	[dest4+tmp], xtmpl4	;Store result

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16

; 0x0f in every byte: nibble mask, also used as a broadcast-byte-15 shuffle.
mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
; Byte i holds (i - 16) as a signed byte; compared against the broadcast
; negative tail length to build the overlap-pass write mask.
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func          core, ver, snum
slversion gf_4vect_mad_sse, 00, 01, 0209
|
303
erasure_code/gf_5vect_dot_prod_avx.asm
Normal file
303
erasure_code/gf_5vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,303 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define tmp5 r14 ; must be saved and restored
|
||||
%define tmp6 r15 ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define tmp5 rdi ; must be saved and restored
|
||||
%define tmp6 rsi ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_xmm128 xmm12, 6*16
|
||||
save_xmm128 xmm13, 7*16
|
||||
save_xmm128 xmm14, 8*16
|
||||
save_xmm128 xmm15, 9*16
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
save_reg rsi, 10*16 + 5*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
vmovdqa xmm12, [rsp + 6*16]
|
||||
vmovdqa xmm13, [rsp + 7*16]
|
||||
vmovdqa xmm14, [rsp + 8*16]
|
||||
vmovdqa xmm15, [rsp + 9*16]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
mov rsi, [rsp + 10*16 + 5*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
%define ptr arg5
|
||||
%define vec_i tmp2
|
||||
%define dest1 tmp3
|
||||
%define dest2 tmp4
|
||||
%define vskip1 tmp5
|
||||
%define vskip3 tmp6
|
||||
%define pos return
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft1_lo xmm14
|
||||
%define xgft1_hi xmm13
|
||||
%define xgft2_lo xmm12
|
||||
%define xgft2_hi xmm11
|
||||
%define xgft3_lo xmm10
|
||||
%define xgft3_hi xmm9
|
||||
%define xgft4_lo xmm8
|
||||
%define xgft4_hi xmm7
|
||||
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%define xp4 xmm5
|
||||
%define xp5 xmm6
|
||||
|
||||
align 16
|
||||
global gf_5vect_dot_prod_avx:function
|
||||
func(gf_5vect_dot_prod_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov vskip1, vec
|
||||
imul vskip1, 32
|
||||
mov vskip3, vec
|
||||
imul vskip3, 96
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
mov dest1, [dest]
|
||||
mov dest2, [dest+PS]
|
||||
|
||||
|
||||
.loop16:
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
vpxor xp3, xp3
|
||||
vpxor xp4, xp4
|
||||
vpxor xp5, xp5
|
||||
|
||||
|
||||
.next_vect:
|
||||
mov ptr, [src+vec_i]
|
||||
add vec_i, PS
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
vmovdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
vmovdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
vmovdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
vmovdqu xgft1_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
add tmp, 32
|
||||
|
||||
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||
vpxor xp3, xgft3_hi ;xp3 += partial
|
||||
|
||||
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpxor xp4, xgft4_hi ;xp4 += partial
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp5, xgft1_hi ;xp5 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
mov tmp, [dest+2*PS]
|
||||
mov ptr, [dest+3*PS]
|
||||
mov vec_i, [dest+4*PS]
|
||||
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
XSTR [tmp+pos], xp3
|
||||
XSTR [ptr+pos], xp4
|
||||
XSTR [vec_i+pos], xp5
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop16 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_dot_prod_avx, 02, 04, 0194
|
315
erasure_code/gf_5vect_dot_prod_avx2.asm
Normal file
315
erasure_code/gf_5vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,315 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define tmp5 r14 ; must be saved and restored
|
||||
%define tmp6 r15 ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define tmp5 rdi ; must be saved and restored
|
||||
%define tmp6 rsi ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
vmovdqa [rsp + 0*16], xmm6
|
||||
vmovdqa [rsp + 1*16], xmm7
|
||||
vmovdqa [rsp + 2*16], xmm8
|
||||
vmovdqa [rsp + 3*16], xmm9
|
||||
vmovdqa [rsp + 4*16], xmm10
|
||||
vmovdqa [rsp + 5*16], xmm11
|
||||
vmovdqa [rsp + 6*16], xmm12
|
||||
vmovdqa [rsp + 7*16], xmm13
|
||||
vmovdqa [rsp + 8*16], xmm14
|
||||
vmovdqa [rsp + 9*16], xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
save_reg rsi, 10*16 + 5*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
vmovdqa xmm12, [rsp + 6*16]
|
||||
vmovdqa xmm13, [rsp + 7*16]
|
||||
vmovdqa xmm14, [rsp + 8*16]
|
||||
vmovdqa xmm15, [rsp + 9*16]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
mov rsi, [rsp + 10*16 + 5*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
%define ptr arg5
|
||||
%define vec_i tmp2
|
||||
%define dest1 tmp3
|
||||
%define dest2 tmp4
|
||||
%define vskip1 tmp5
|
||||
%define vskip3 tmp6
|
||||
%define pos return
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft1_hi ymm13
|
||||
%define xgft2_lo ymm12
|
||||
%define xgft2_hi ymm11
|
||||
%define xgft3_lo ymm10
|
||||
%define xgft3_hi ymm9
|
||||
%define xgft4_lo ymm8
|
||||
%define xgft4_hi ymm7
|
||||
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%define xp3 ymm4
|
||||
%define xp4 ymm5
|
||||
%define xp5 ymm6
|
||||
|
||||
align 16
|
||||
global gf_5vect_dot_prod_avx2:function
|
||||
func(gf_5vect_dot_prod_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
mov vskip1, vec
|
||||
imul vskip1, 32
|
||||
mov vskip3, vec
|
||||
imul vskip3, 96
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
mov dest1, [dest]
|
||||
mov dest2, [dest+PS]
|
||||
|
||||
|
||||
.loop32:
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
vpxor xp3, xp3
|
||||
vpxor xp4, xp4
|
||||
vpxor xp5, xp5
|
||||
|
||||
|
||||
.next_vect:
|
||||
mov ptr, [src+vec_i]
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
add vec_i, PS
|
||||
|
||||
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
vmovdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
add tmp, 32
|
||||
|
||||
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||
vpxor xp3, xgft3_hi ;xp3 += partial
|
||||
|
||||
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpxor xp4, xgft4_hi ;xp4 += partial
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp5, xgft1_hi ;xp5 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
mov tmp, [dest+2*PS]
|
||||
mov ptr, [dest+3*PS]
|
||||
mov vec_i, [dest+4*PS]
|
||||
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
XSTR [tmp+pos], xp3
|
||||
XSTR [ptr+pos], xp4
|
||||
XSTR [vec_i+pos], xp5
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop32 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_dot_prod_avx2, 04, 04, 0199
|
304
erasure_code/gf_5vect_dot_prod_sse.asm
Normal file
304
erasure_code/gf_5vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,304 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define tmp5 r14 ; must be saved and restored
|
||||
%define tmp6 r15 ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define tmp5 rdi ; must be saved and restored
|
||||
%define tmp6 rsi ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_xmm128 xmm12, 6*16
|
||||
save_xmm128 xmm13, 7*16
|
||||
save_xmm128 xmm14, 8*16
|
||||
save_xmm128 xmm15, 9*16
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
save_reg rsi, 10*16 + 5*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp + 0*16]
|
||||
movdqa xmm7, [rsp + 1*16]
|
||||
movdqa xmm8, [rsp + 2*16]
|
||||
movdqa xmm9, [rsp + 3*16]
|
||||
movdqa xmm10, [rsp + 4*16]
|
||||
movdqa xmm11, [rsp + 5*16]
|
||||
movdqa xmm12, [rsp + 6*16]
|
||||
movdqa xmm13, [rsp + 7*16]
|
||||
movdqa xmm14, [rsp + 8*16]
|
||||
movdqa xmm15, [rsp + 9*16]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
mov rsi, [rsp + 10*16 + 5*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
%define ptr arg5
|
||||
%define vec_i tmp2
|
||||
%define dest1 tmp3
|
||||
%define dest2 tmp4
|
||||
%define vskip1 tmp5
|
||||
%define vskip3 tmp6
|
||||
%define pos return
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft1_lo xmm2
|
||||
%define xgft1_hi xmm3
|
||||
%define xgft2_lo xmm4
|
||||
%define xgft2_hi xmm5
|
||||
%define xgft3_lo xmm10
|
||||
%define xgft3_hi xmm6
|
||||
%define xgft4_lo xmm8
|
||||
%define xgft4_hi xmm7
|
||||
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm9
|
||||
%define xp2 xmm11
|
||||
%define xp3 xmm12
|
||||
%define xp4 xmm13
|
||||
%define xp5 xmm14
|
||||
|
||||
align 16
|
||||
global gf_5vect_dot_prod_sse:function
|
||||
func(gf_5vect_dot_prod_sse)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov vskip1, vec
|
||||
imul vskip1, 32
|
||||
mov vskip3, vec
|
||||
imul vskip3, 96
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
mov dest1, [dest]
|
||||
mov dest2, [dest+PS]
|
||||
|
||||
|
||||
.loop16:
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
pxor xp1, xp1
|
||||
pxor xp2, xp2
|
||||
pxor xp3, xp3
|
||||
pxor xp4, xp4
|
||||
pxor xp5, xp5
|
||||
|
||||
|
||||
.next_vect:
|
||||
mov ptr, [src+vec_i]
|
||||
add vec_i, PS
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
|
||||
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
movdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
movdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
movdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
movdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
pxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
pxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
movdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
movdqu xgft1_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
add tmp, 32
|
||||
|
||||
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||
pxor xp3, xgft3_hi ;xp3 += partial
|
||||
|
||||
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
pxor xp4, xgft4_hi ;xp4 += partial
|
||||
|
||||
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
pxor xp5, xgft1_hi ;xp5 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
mov tmp, [dest+2*PS]
|
||||
mov ptr, [dest+3*PS]
|
||||
mov vec_i, [dest+4*PS]
|
||||
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
XSTR [tmp+pos], xp3
|
||||
XSTR [ptr+pos], xp4
|
||||
XSTR [vec_i+pos], xp5
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop16 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_dot_prod_sse, 00, 05, 0065
|
319
erasure_code/gf_5vect_dot_prod_sse_perf.c
Normal file
319
erasure_code/gf_5vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,319 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/*
 * Print the first len bytes of buf as two-digit hex values,
 * 32 bytes per output row, followed by a final newline.
 */
void dump(unsigned char *buf, int len)
{
	int n;

	for (n = 0; n < len; n++) {
		printf(" %2x", buf[n] & 0xff);
		if ((n + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Print a k-row by m-column byte matrix in hex, one matrix row
 * per output line, with a trailing blank line.
 */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls, *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest_ref1, *dest_ref2;
|
||||
u8 *dest_ref3, *dest_ref4, *dest_ref5, *dest_ptrs[5];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 6 * TEST_SOURCES * 32)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 20; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_5vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 5) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 5) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
805
erasure_code/gf_5vect_dot_prod_sse_test.c
Normal file
805
erasure_code/gf_5vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,805 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 20000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/*
 * Hex-dump len bytes of buf to stdout, wrapping every 32 bytes,
 * and end with a newline.
 */
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", 0xff & buf[pos]);
		pos++;
		if (pos % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Hex-dump a matrix given as k row pointers of m bytes each,
 * one row per output line, ending with a blank line.
 */
void dump_matrix(unsigned char **s, int k, int m)
{
	int i, j;

	for (i = 0; i < k; i++) {
		unsigned char *row = s[i];

		for (j = 0; j < m; j++)
			printf(" %2x", row[j]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Hex-dump a flat k-by-m byte array stored row-major in s,
 * one row per output line, ending with a blank line.
 */
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *base = s + (row * m);

		for (col = 0; col < m; col++)
			printf(" %2x", base[col] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls;
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *buffs[TEST_SOURCES];
|
||||
u8 *dest_ref1, *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5;
|
||||
u8 *dest_ptrs[5];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[5];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
memset(g4, 9, TEST_SOURCES);
|
||||
memset(g5, 4, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
|
||||
dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test4 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test5 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref4, dest4, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref5, dest5, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, udest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[3] - dest4;
|
||||
if (memcmp(dest4, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad4 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad4 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[4] - dest5;
|
||||
if (memcmp(dest5, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad5 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad5 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, dest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
365
erasure_code/gf_5vect_mad_avx.asm
Normal file
365
erasure_code/gf_5vect_mad_avx.asm
Normal file
@ -0,0 +1,365 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define tmp4 r14
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 5*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define tmp4 r13
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp4
|
||||
%define dest3 mul_array
|
||||
%define dest4 tmp2
|
||||
%define dest5 vec_i
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft5_hi xmm14
|
||||
%define xgft4_lo xmm13
|
||||
%define xgft4_hi xmm12
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xtmph3 xmm6
|
||||
%define xtmpl3 xmm7
|
||||
%define xtmph5 xmm8
|
||||
%define xtmpl5 xmm9
|
||||
%define xd1 xmm10
|
||||
%define xd2 xmm11
|
||||
%define xd3 xtmpl1
|
||||
%define xd4 xtmph1
|
||||
%define xd5 xtmpl2
|
||||
|
||||
|
||||
align 16
|
||||
global gf_5vect_mad_avx:function
|
||||
func(gf_5vect_mad_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov tmp, vec
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
lea tmp3, [mul_array + vec_i]
|
||||
sal tmp, 6 ;Multiply by 64
|
||||
vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
sal vec, 5 ;Multiply by 32
|
||||
add tmp, vec
|
||||
vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
|
||||
vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
|
||||
|
||||
mov dest3, [dest1+2*PS] ; reuse mul_array
|
||||
mov dest4, [dest1+3*PS]
|
||||
mov dest5, [dest1+4*PS] ; reuse vec_i
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop16:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1
|
||||
|
||||
XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector
|
||||
XLDR xd4, [dest4+pos] ;Reuse xtmph1, Get next dest vector
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2
|
||||
|
||||
XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph3
|
||||
|
||||
; dest4
|
||||
vpshufb xtmph2, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd4, xd4, xtmph2
|
||||
|
||||
; dest5
|
||||
vpshufb xtmph5, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph5, xtmph5, xtmpl5 ;GF add high and low partials
|
||||
vpxor xd5, xd5, xtmph5
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
XSTR [dest4+pos], xd4 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd5 ;Store result into dest5
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan16:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp, len ;Overlapped offset length-16
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph1, [constip16] ;Load const of i + 16
|
||||
vpinsrb xtmph5, len.w, 15
|
||||
vpshufb xtmph5, xmask0f ;Broadcast len to all bytes
|
||||
vpcmpgtb xtmph5, xtmph5, xtmph1
|
||||
|
||||
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmph5
|
||||
vpxor xd1, xd1, xtmph1
|
||||
|
||||
XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector
|
||||
XLDR xd4, [dest4+tmp] ;Reuse xtmph1, Get next dest vector
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmph5
|
||||
vpxor xd2, xd2, xtmph2
|
||||
|
||||
XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpand xtmph3, xtmph3, xtmph5
|
||||
vpxor xd3, xd3, xtmph3
|
||||
|
||||
; dest4
|
||||
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpand xgft4_hi, xgft4_hi, xtmph5
|
||||
vpxor xd4, xd4, xgft4_hi
|
||||
|
||||
; dest5
|
||||
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft5_hi, xgft5_hi, xtmpl5 ;GF add high and low partials
|
||||
vpand xgft5_hi, xgft5_hi, xtmph5
|
||||
vpxor xd5, xd5, xgft5_hi
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
XSTR [dest4+tmp], xd4 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd5 ;Store result into dest5
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
constip16:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_mad_avx, 02, 01, 020d
|
363
erasure_code/gf_5vect_mad_avx2.asm
Normal file
363
erasure_code/gf_5vect_mad_avx2.asm
Normal file
@ -0,0 +1,363 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r15, 10*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r15, [rsp + 10*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
%define dest3 mul_array
|
||||
%define dest4 vec
|
||||
%define dest5 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft2_lo ymm13
|
||||
%define xgft3_lo ymm12
|
||||
%define xgft4_lo ymm11
|
||||
%define xgft5_lo ymm10
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmpl ymm2
|
||||
%define xtmplx xmm2
|
||||
%define xtmph1 ymm3
|
||||
%define xtmph1x xmm3
|
||||
%define xtmph2 ymm4
|
||||
%define xd1 ymm5
|
||||
%define xd2 ymm6
|
||||
%define xd3 ymm7
|
||||
%define xd4 ymm8
|
||||
%define xd5 ymm9
|
||||
|
||||
align 16
|
||||
global gf_5vect_mad_avx2:function
|
||||
func(gf_5vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5 ;Multiply by 32
|
||||
lea tmp, [mul_array + vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
add tmp, vec
|
||||
vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
mov dest3, [dest1+2*PS] ; reuse mul_array
|
||||
mov dest4, [dest1+3*PS] ; reuse vec
|
||||
mov dest5, [dest1+4*PS] ; reuse vec_i
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop32:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;Get next dest vector
|
||||
XLDR xd4, [dest4+pos] ;Get next dest vector
|
||||
XLDR xd5, [dest5+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
; dest3
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph1 ;xd3 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
; dest4
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
|
||||
vpxor xd4, xd4, xtmph2 ;xd4 += partial
|
||||
|
||||
; dest5
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
||||
vpxor xd5, xd5, xtmph1 ;xd5 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
XSTR [dest3+pos], xd3
|
||||
XSTR [dest4+pos], xd4
|
||||
XSTR [dest5+pos], xd5
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan32:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp.b, 0x1f
|
||||
vpinsrb xtmph1x, xtmph1x, tmp.w, 0
|
||||
vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f...
|
||||
|
||||
mov tmp, len ;Overlapped offset length-32
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;Get next dest vector
|
||||
XLDR xd4, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd5, [dest5+tmp] ;Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph2, [constip32] ;Load const of i + 32
|
||||
vpinsrb xtmplx, xtmplx, len.w, 15
|
||||
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
|
||||
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
|
||||
vpcmpgtb xtmpl, xtmpl, xtmph2
|
||||
|
||||
vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmpl
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
; dest3
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xgft3_lo ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl
|
||||
vpxor xd3, xd3, xtmph1 ;xd3 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
; dest4
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xgft4_lo ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmpl
|
||||
vpxor xd4, xd4, xtmph2 ;xd4 += partial
|
||||
|
||||
; dest5
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xgft5_lo ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl
|
||||
vpxor xd5, xd5, xtmph1 ;xd5 += partial
|
||||
|
||||
XSTR [dest1+tmp], xd1
|
||||
XSTR [dest2+tmp], xd2
|
||||
XSTR [dest3+tmp], xd3
|
||||
XSTR [dest4+tmp], xd4
|
||||
XSTR [dest5+tmp], xd5
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
align 32
|
||||
constip32:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_mad_avx2, 04, 01, 020e
|
373
erasure_code/gf_5vect_mad_sse.asm
Normal file
373
erasure_code/gf_5vect_mad_sse.asm
Normal file
@ -0,0 +1,373 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8				; pointer size in bytes (x86-64)

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: first four integer args arrive in rcx/rdx/r8/r9;
 ;; args 5 and 6 are read from the caller's stack in FUNC_SAVE.
 ;; xmm6-xmm15 and r12-r15 are callee-saved, hence the spill area below.
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; loaded from stack slot arg(4)
 %define arg5   r15			; loaded from stack slot arg(5)
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13
 %define tmp4   r14
 %define return   rax
 %define return.w eax
 ;; 10 xmm save slots + 4 GPR save slots + 8 bytes pad to keep rsp 16-aligned
 %define stack_size 16*10 + 5*8
 ;; stack args live above our frame: saved regs + return address
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ;; Spill callee-saved state, then fetch the stack-passed args.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r14, 10*16 + 2*8
	save_reg	r15, 10*16 + 3*8
	end_prolog
	mov	arg4, arg(4)		; 5th arg (src) comes in on the stack
	mov	arg5, arg(5)		; 6th arg (dest) comes in on the stack
 %endmacro

 ;; Restore callee-saved state in the reverse layout of FUNC_SAVE.
 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r14, [rsp + 10*16 + 2*8]
	mov	r15, [rsp + 10*16 + 3*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: all six args arrive in registers; all xmm regs
 ;; are caller-saved, so only the r12/r13 scratch registers need saving.
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12			; callee-saved; pushed in FUNC_SAVE
 %define tmp4   r13			; callee-saved; pushed in FUNC_SAVE
 %define return   rax
 %define return.w eax

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif
|
||||
|
||||
;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
;;; Argument aliases onto the ABI registers defined above.
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

;; dest2-dest5 reuse registers whose original values are dead once the
;; five destination pointers have been loaded from the dest array.
%define dest2     tmp4
%define dest3     mul_array		; mul_array no longer needed after setup
%define dest4     tmp2
%define dest5     vec_i			; vec_i no longer needed after setup

%ifndef EC_ALIGNED_ADDR
 ;;; Use unaligned load/store (default; buffers need no alignment)
 %define XLDR movdqu
 %define XSTR movdqu
%else
 ;;; Buffers are 16B-aligned: use non-temporal load/store unless disabled
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

;; xmm register assignments.  Note xd3/xd4/xd5 deliberately alias
;; temporaries (xtmpl1/xtmph1/xtmpl2) that are dead by the time the
;; corresponding dest vectors are loaded — there are not enough xmm
;; registers for distinct names.
%define xmask0f  xmm15
%define xgft5_hi xmm14
%define xgft4_lo xmm13
%define xgft4_hi xmm12

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph5 xmm8
%define xtmpl5 xmm9
%define xd1    xmm10
%define xd2    xmm11
%define xd3    xtmpl1			; aliased: xtmpl1 is dead before xd3 loads
%define xd4    xtmph1			; aliased: xtmph1 is dead before xd4 loads
%define xd5    xtmpl2			; aliased: xtmpl2 is dead before xd5 loads
|
||||
|
||||
|
||||
align 16
global gf_5vect_mad_sse:function
;; GF(2^8) multiply-accumulate of one source vector into 5 dest vectors:
;;   dest[j] ^= gf_mul(g_tbls[j], src)  for j = 0..4
;; Each multiply is done per nibble with two pshufb table lookups.
;; Returns 0 on success, 1 if len < 16.
func(gf_5vect_mad_sse)
	FUNC_SAVE
	sub	len, 16			; fail if fewer than 16 bytes
	jl	.return_fail
	xor	pos, pos

	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	tmp, vec
	sal	vec_i, 5		;Multiply by 32 (bytes per gf table)
	lea	tmp3, [mul_array + vec_i] ;tmp3 -> tables for this source vector
	sal	tmp, 6			;Multiply by 64 (2*32)
	;; Table 5 (Ex) lives at offset 4*vec*32 = 2*tmp from tmp3
	movdqu	xgft5_hi, [tmp3+2*tmp+16]	; " Ex{00}, Ex{10}, ..., Ex{f0}
	sal	vec, 5			;Multiply by 32
	add	tmp, vec		;tmp = 3*vec*32: offset of table 4 (Dx)
	movdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
	movdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...

	;; Load the five destination pointers; dest3/dest5 reuse
	;; mul_array/vec_i, which are dead from here on.
	mov	dest3, [dest1+2*PS]	; reuse mul_array
	mov	dest4, [dest1+3*PS]
	mov	dest5, [dest1+4*PS]	; reuse vec_i
	mov	dest2, [dest1+PS]
	mov	dest1, [dest1]		; must be loaded last (overwrites base)

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector

	;; Reload the A/B/C tables every iteration; only the D/E tables fit
	;; in registers across the loop.
	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
	movdqa	xtmph5, xgft5_hi	;Reload const array registers

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector

	;; Split src bytes into low and high nibbles (pshufb indices).
	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pxor	xd1, xtmph1		;xd1 += partial

	;; xtmpl1/xtmph1 are now dead, freeing xd3/xd4 (aliases).
	XLDR	xd3, [dest3+pos]	;Reuse xtmpl1. Get next dest vector
	XLDR	xd4, [dest4+pos]	;Reuse xtmph1. Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pxor	xd2, xtmph2		;xd2 += partial

	XLDR	xd5, [dest5+pos]	;Reuse xtmpl2. Get next dest vector

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pxor	xd3, xtmph3		;xd3 += partial

	movdqa	xtmph2, xgft4_hi	;Reload const array registers
	movdqa	xtmpl3, xgft4_lo	;Reload const array registers

	; dest5 (done before dest4, whose lookups clobber xtmph2/xtmpl3)
	pshufb	xtmph5, x0		;Lookup mul table of high nibble
	pshufb	xtmpl5, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph5, xtmpl5		;GF add high and low partials
	pxor	xd5, xtmph5		;xd5 += partial

	; dest4
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl3		;GF add high and low partials
	pxor	xd4, xtmph2		;xd4 += partial

	XSTR	[dest1+pos], xd1	;Store result into dest1
	XSTR	[dest2+pos], xd2	;Store result into dest2
	XSTR	[dest3+pos], xd3	;Store result into dest3
	XSTR	[dest4+pos], xd4	;Store result into dest4
	XSTR	[dest5+pos], xd5	;Store result into dest5

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;original length; len was biased by -16
	cmp	pos, tmp
	je	.return_pass		;length was a multiple of 16 - done

.lessthan16:
	;; Tail len
	;; Do one more (partially overlapping) 16-byte pass at offset len,
	;; masking out the bytes that the main loop already processed.
	mov	tmp, len		;Overlapped offset length-16
	XLDR	x0, [src+tmp]		;Get next source vector

	sub	len, pos		;len - pos = -(overlap count), in [-15,-1]

	movdqa	xtmpl1, [constip16]	;Load const: byte i holds -(i+1)
	pinsrb	xtmph5, len.w, 15	;Insert (len - pos) into byte 15
	pshufb	xtmph5, xmask0f		;Broadcast len to all bytes (index 0x0f)
	;; Signed compare: byte i set iff (len-pos) > -(i+1), i.e. for the
	;; i >= pos-len positions the main loop has NOT yet written.
	pcmpgtb	xtmph5, xtmpl1

	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pand	xtmph1, xtmph5		;Keep only the unprocessed tail bytes
	pxor	xd1, xtmph1

	XLDR	xd3, [dest3+tmp]	;Reuse xtmpl1. Get next dest vector
	XLDR	xd4, [dest4+tmp]	;Reuse xtmph1. Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pand	xtmph2, xtmph5		;Keep only the unprocessed tail bytes
	pxor	xd2, xtmph2

	XLDR	xd5, [dest5+tmp]	;Reuse xtmpl2. Get next dest vector

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pand	xtmph3, xtmph5		;Keep only the unprocessed tail bytes
	pxor	xd3, xtmph3

	; dest4 (the D tables are still live in xgft4_hi/lo; safe to clobber
	; here since this is the final pass)
	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	pand	xgft4_hi, xtmph5	;Keep only the unprocessed tail bytes
	pxor	xd4, xgft4_hi

	; dest5
	pshufb	xgft5_hi, x0		;Lookup mul table of high nibble
	pshufb	xtmpl5, xtmpa		;Lookup mul table of low nibble
	pxor	xgft5_hi, xtmpl5	;GF add high and low partials
	pand	xgft5_hi, xtmph5	;Keep only the unprocessed tail bytes
	pxor	xd5, xgft5_hi

	XSTR	[dest1+tmp], xd1	;Store result into dest1
	XSTR	[dest2+tmp], xd2	;Store result into dest2
	XSTR	[dest3+tmp], xd3	;Store result into dest3
	XSTR	[dest4+tmp], xd4	;Store result into dest4
	XSTR	[dest5+tmp], xd5	;Store result into dest5

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame
|
||||
|
||||
section .data

align 16

;; 0x0f in every byte: isolates a nibble for use as pshufb indices
mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;; byte i = 0xff - i = -(i+1) signed; compared against the broadcast
;; (len - pos) to build the tail-overlap byte mask
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func            core, ver, snum
slversion gf_5vect_mad_sse, 00, 01, 020c
|
315
erasure_code/gf_6vect_dot_prod_avx.asm
Normal file
315
erasure_code/gf_6vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,315 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: all args in registers; xmm regs are caller-saved,
 ;; so only the callee-saved GPRs used as scratch need push/pop.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %define PS     8		; pointer size
 %define LOG_PS 3		; log2(PS), for scaling indices by shift

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: args 1-4 in rcx/rdx/r8/r9, arg 5 on the stack;
 ;; xmm6-xmm15, r12-r15, rdi, rsi are callee-saved.
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved, loaded and restored
 %define arg5  r15		; must be saved and restored
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r14		; must be saved and restored
 %define tmp5  rdi		; must be saved and restored
 %define tmp6  rsi		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size  10*16 + 7*8	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_xmm128	xmm15, 9*16
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r14, 10*16 + 2*8
	save_reg	r15, 10*16 + 3*8
	save_reg	rdi, 10*16 + 4*8
	save_reg	rsi, 10*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th arg (dests) comes in on the stack
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r14, [rsp + 10*16 + 2*8]
	mov	r15, [rsp + 10*16 + 3*8]
	mov	rdi, [rsp + 10*16 + 4*8]
	mov	rsi, [rsp + 10*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif
|
||||
|
||||
;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests)
;; Argument aliases onto the ABI registers defined above.
%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest1     tmp3
%define dest2     tmp4
%define vskip1    tmp5		; vec * 32:  stride between gf tables
%define vskip3    tmp6		; vec * 96:  3 * vskip1, table 4 offset
%define pos       return


%ifndef EC_ALIGNED_ADDR
 ;;; Use unaligned load/store (default; buffers need no alignment)
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
 ;;; Buffers are 16B-aligned: use non-temporal load/store unless disabled
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

;; xmm register assignments: six table pairs would not fit, so only
;; three lo/hi table pairs are held at a time (reloaded mid-iteration).
%define xmask0f  xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9
%define x0       xmm0
%define xtmpa    xmm1
%define xp1      xmm2		; running GF sums for the six outputs
%define xp2      xmm3
%define xp3      xmm4
%define xp4      xmm5
%define xp5      xmm6
%define xp6      xmm7
|
||||
|
||||
align 16
global gf_6vect_dot_prod_avx:function
;; GF(2^8) dot product of `vec` source buffers into 6 dest buffers:
;;   dests[j] = XOR_i gf_mul(g_tbls[j][i], buffs[i])  for j = 0..5
;; Each multiply is a pair of vpshufb nibble-table lookups.
;; Returns 0 on success, 1 if len < 16.
func(gf_6vect_dot_prod_avx)
	FUNC_SAVE
	sub	len, 16			; fail if fewer than 16 bytes
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip1, vec
	imul	vskip1, 32		;vskip1 = vec*32: bytes per output's tables
	mov	vskip3, vec
	imul	vskip3, 96		;vskip3 = vec*96 = 3*vskip1
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	mov	dest1, [dest]		;cache first two dest pointers
	mov	dest2, [dest+PS]


.loop16:
	mov	tmp, mul_array		;tmp walks the per-source table slots
	xor	vec_i, vec_i
	vpxor	xp1, xp1		;clear the six accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	vpxor	xp5, xp5
	vpxor	xp6, xp6

.next_vect:
	mov	ptr, [src+vec_i]	;ptr -> next source buffer
	add	vec_i, PS
	XLDR	x0, [ptr+pos]		;Get next source vector

	;; Tables for outputs 1-3 of this source (lo = low-nibble products,
	;; hi = high-nibble products).
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vskip1*1+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vskip1*2+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5 (ptr is free here)

	;; Split src bytes into low and high nibbles (vpshufb indices).
	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0


	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial


	;; Reuse the same registers for the tables of outputs 4-6.
	vmovdqu	xgft1_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	vmovdqu	xgft1_hi, [tmp+vskip3+16]	; "     Dx{00}, Dx{10}, ..., Dx{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
	vmovdqu	xgft2_hi, [tmp+vskip1*4+16]	; "     Ex{00}, Ex{10}, ..., Ex{f0}
	vmovdqu	xgft3_lo, [tmp+ptr]		;Load array Fx{00}, Fx{01}, ..., Fx{0f}
	vmovdqu	xgft3_hi, [tmp+ptr+16]		; "     Fx{00}, Fx{10}, ..., Fx{f0}
	add	tmp, 32			;advance to next source's tables


	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp4, xgft1_hi		;xp4 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp5, xgft2_hi		;xp5 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp6, xgft3_hi		;xp6 += partial

	cmp	vec_i, vec
	jl	.next_vect


	;; Fetch remaining dest pointers into scratch and store all six sums.
	mov	tmp, [dest+2*PS]
	mov	ptr, [dest+3*PS]
	mov	vec_i, [dest+4*PS]

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	XSTR	[tmp+pos], xp3
	mov	tmp, [dest+5*PS]
	XSTR	[ptr+pos], xp4
	XSTR	[vec_i+pos], xp5
	XSTR	[tmp+pos], xp6

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;original length; len was biased by -16
	cmp	pos, tmp
	je	.return_pass		;length was a multiple of 16 - done

	;; Tail len: redo the last 16 bytes at an overlapping offset.
	;; Recomputing the overlap is harmless - results are idempotent
	;; because each pass fully rewrites the dest bytes it covers.
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame
|
||||
|
||||
section .data

align 16
;; 0x0f in every byte: isolates a nibble for use as vpshufb indices
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func                 core, ver, snum
slversion gf_6vect_dot_prod_avx, 02, 04, 0195
|
326
erasure_code/gf_6vect_dot_prod_avx2.asm
Normal file
326
erasure_code/gf_6vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,326 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: all args in registers; xmm/ymm regs are
 ;; caller-saved, so only the callee-saved GPR scratch needs push/pop.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp    r11
 %define tmp.w  r11d		; 32-bit and 8-bit views of tmp, used for
 %define tmp.b  r11b		; building the 0x0f byte mask with vpinsrb
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r12		; must be saved and restored
 %define tmp5   r14		; must be saved and restored
 %define tmp6   r15		; must be saved and restored
 %define return rax
 %define PS     8		; pointer size
 %define LOG_PS 3		; log2(PS), for scaling indices by shift

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: args 1-4 in rcx/rdx/r8/r9, arg 5 on the stack;
 ;; xmm6-xmm15, r12-r15, rdi, rsi are callee-saved.
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved, loaded and restored
 %define arg5  r15		; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size  10*16 + 7*8	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r14, 10*16 + 2*8
	save_reg	r15, 10*16 + 3*8
	save_reg	rdi, 10*16 + 4*8
	save_reg	rsi, 10*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th arg (dests) comes in on the stack
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r14, [rsp + 10*16 + 2*8]
	mov	r15, [rsp + 10*16 + 3*8]
	mov	rdi, [rsp + 10*16 + 4*8]
	mov	rsi, [rsp + 10*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif
|
||||
|
||||
;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
;; Argument aliases onto the ABI registers defined above.
%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest1     tmp3
%define dest2     tmp4
%define vskip1    tmp5		; vec * 32:  stride between gf tables
%define vskip3    tmp6		; vec * 96:  3 * vskip1, table 4 offset
%define pos       return


%ifndef EC_ALIGNED_ADDR
 ;;; Use unaligned load/store (default; buffers need no alignment)
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
 ;;; Buffers are 32B-aligned: use non-temporal load/store unless disabled
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

;; ymm register assignments. xmask0fx is the xmm (low-lane) view of
;; xmask0f, used to seed the broadcast of the 0x0f byte.
%define xmask0f  ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft1_hi ymm13
%define xgft2_lo ymm12
%define xgft2_hi ymm11
%define xgft3_lo ymm10
%define xgft3_hi ymm9
%define x0       ymm0
%define xtmpa    ymm1
%define xp1      ymm2		; running GF sums for the six outputs
%define xp2      ymm3
%define xp3      ymm4
%define xp4      ymm5
%define xp5      ymm6
%define xp6      ymm7
|
||||
|
||||
align 16
global gf_6vect_dot_prod_avx2:function
;; GF(2^8) dot product of `vec` source buffers into 6 dest buffers,
;; AVX2 version processing 32 bytes per pass:
;;   dests[j] = XOR_i gf_mul(g_tbls[j][i], buffs[i])  for j = 0..5
;; The 16-byte lo/hi lookup tables are 128-bit, so the nibble lanes are
;; rearranged with vperm2i128 to make one vpshufb serve both lanes.
;; Returns 0 on success, 1 if len < 32.
func(gf_6vect_dot_prod_avx2)
	FUNC_SAVE
	sub	len, 32			; fail if fewer than 32 bytes
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
	mov	vskip1, vec
	imul	vskip1, 32		;vskip1 = vec*32: bytes per output's tables
	mov	vskip3, vec
	imul	vskip3, 96		;vskip3 = vec*96 = 3*vskip1
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	mov	dest1, [dest]		;cache first two dest pointers
	mov	dest2, [dest+PS]


.loop32:
	mov	tmp, mul_array		;tmp walks the per-source table slots
	xor	vec_i, vec_i
	vpxor	xp1, xp1		;clear the six accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	vpxor	xp5, xp5
	vpxor	xp6, xp6

.next_vect:
	mov	ptr, [src+vec_i]	;ptr -> next source buffer
	XLDR	x0, [ptr+pos]		;Get next source vector
	add	vec_i, PS

	;; Split into nibbles, then cross-swap 128-bit lanes so that the
	;; same 128-bit table (duplicated by vperm2i128 below) can be
	;; looked up for both lanes with a single vpshufb.
	vpand	xgft3_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vperm2i128 xtmpa, xgft3_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xgft3_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo

	;; Each 32B load brings in lo table (low lane) and hi table (high
	;; lane) together for outputs 1-3 of this source.
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
					; "     Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
					; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
					; "     Cx{00}, Cx{10}, ..., Cx{f0}
	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5 (ptr is free here)

	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01	; swapped to hi | lo

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial


	;; Reuse the same registers for the tables of outputs 4-6.
	vmovdqu	xgft1_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
					; "     Dx{00}, Dx{10}, ..., Dx{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
					; "     Ex{00}, Ex{10}, ..., Ex{f0}
	vmovdqu	xgft3_lo, [tmp+ptr]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
					; "     Fx{00}, Fx{10}, ..., Fx{f0}
	add	tmp, 32			;advance to next source's tables
	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01	; swapped to hi | lo

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp4, xgft1_hi		;xp4 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp5, xgft2_hi		;xp5 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp6, xgft3_hi		;xp6 += partial

	cmp	vec_i, vec
	jl	.next_vect


	;; Fetch remaining dest pointers into scratch and store all six sums.
	mov	tmp, [dest+2*PS]
	mov	ptr, [dest+3*PS]
	mov	vec_i, [dest+4*PS]

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	XSTR	[tmp+pos], xp3
	mov	tmp, [dest+5*PS]
	XSTR	[ptr+pos], xp4
	XSTR	[vec_i+pos], xp5
	XSTR	[tmp+pos], xp6

	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		;original length; len was biased by -32
	cmp	pos, tmp
	je	.return_pass		;length was a multiple of 32 - done

	;; Tail len: redo the last 32 bytes at an overlapping offset.
	;; Recomputing the overlap is harmless - results are idempotent
	;; because each pass fully rewrites the dest bytes it covers.
	mov	pos, len		;Overlapped offset length-32
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame
|
||||
|
||||
section .data
;; No table constants needed: the 0x0f mask is built in registers with
;; vpinsrb + vpbroadcastb at function entry.

;;; func                  core, ver, snum
slversion gf_6vect_dot_prod_avx2, 04, 04, 019a
|
315
erasure_code/gf_6vect_dot_prod_sse.asm
Normal file
315
erasure_code/gf_6vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,315 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: first six integer args arrive in rdi, rsi, rdx, rcx, r8, r9
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 ;; Scratch and callee-saved temporaries used by the function body
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3		; log2(PS), used to scale vec into a byte count

 %define func(x) x:
 ;; Preserve the callee-saved GPRs aliased as tmp3..tmp6 above
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 ;; Restore in reverse push order
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
 ;; Microsoft x64 ABI: first four integer args in rcx, rdx, r8, r9;
 ;; arg4 comes from the stack and is loaded into r12 by FUNC_SAVE.
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12 		; must be saved, loaded and restored
 %define arg5   r15 		; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %define PS 8
 %define LOG_PS 3
 ;; 10 xmm saves (xmm6-xmm15) + 6 GPR saves + 1 pad slot.  An odd multiple
 ;; of 8 keeps rsp 16-byte aligned after the call pushed the return address,
 ;; so the save_xmm128 slots below are 16-byte aligned.
 %define stack_size  10*16 + 7*8		; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 ;; Win64 prologue: spill the nonvolatile xmm and GPR registers this
 ;; routine uses, then fetch the 5th argument (dests) from the stack.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_xmm128	xmm15, 9*16
	save_reg	r12,  10*16 + 0*8
	save_reg	r13,  10*16 + 1*8
	save_reg	r14,  10*16 + 2*8
	save_reg	r15,  10*16 + 3*8
	save_reg	rdi,  10*16 + 4*8
	save_reg	rsi,  10*16 + 5*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro

 ;; Epilogue: restore everything spilled by FUNC_SAVE at the same offsets
 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm8, [rsp + 2*16]
	movdqa	xmm9, [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	movdqa	xmm15, [rsp + 9*16]
	mov	r12,  [rsp + 10*16 + 0*8]
	mov	r13,  [rsp + 10*16 + 1*8]
	mov	r14,  [rsp + 10*16 + 2*8]
	mov	r15,  [rsp + 10*16 + 3*8]
	mov	rdi,  [rsp + 10*16 + 4*8]
	mov	rsi,  [rsp + 10*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif
|
||||
|
||||
;; Readable aliases for the function arguments and temporaries
%define len    arg0		; byte length to process
%define vec    arg1		; number of source vectors
%define mul_array arg2		; base of the GF multiply tables (g_tbls)
%define src    arg3		; array of source buffer pointers
%define dest   arg4		; array of 6 destination buffer pointers
%define ptr    arg5
%define vec_i  tmp2		; byte offset into src pointer array
%define dest1  tmp3		; first output pointer, cached
%define dest2  tmp4		; second output pointer, cached
%define vskip1 tmp5		; 32*vec  = size of one table block
%define vskip3 tmp6		; 96*vec  = offset of the 4th table block
%define pos    return		; current byte offset into the buffers

;; Select load/store flavor at build time
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

;; xmm register roles
%define xmask0f   xmm15	; 0x0f in every byte, for nibble extraction
%define xgft1_lo  xmm2	; low/high-nibble lookup tables, reused for D-F
%define xgft1_hi  xmm3
%define xgft2_lo  xmm4
%define xgft2_hi  xmm5
%define xgft3_lo  xmm6
%define xgft3_hi  xmm7
%define x0     xmm0	; current 16 source bytes
%define xtmpa  xmm1	; unshifted copy of x0 (low nibbles)
%define xp1    xmm8	; the six running GF dot-product accumulators
%define xp2    xmm9
%define xp3    xmm10
%define xp4    xmm11
%define xp5    xmm12
%define xp6    xmm13
||||
|
||||
align 16
global gf_6vect_dot_prod_sse:function
;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests)
;;; Computes six GF(2^8) dot products of `vec` source buffers, 16 bytes per
;;; pass, using pshufb nibble-table lookups.  Returns 0 on success, 1 when
;;; len < 16.
func(gf_6vect_dot_prod_sse)
	FUNC_SAVE
	sub	len, 16			; len becomes the last valid 16B offset
	jl	.return_fail		; reject len < 16
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip1, vec
	imul	vskip1, 32		; vskip1 = 32*vec (one table block)
	mov	vskip3, vec
	imul	vskip3, 96		; vskip3 = 96*vec (start of 4th block)
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	mov	dest1, [dest]		; cache the first two output pointers
	mov	dest2, [dest+PS]	; in callee-saved registers


.loop16:				; one iteration per 16-byte output slice
	mov	tmp, mul_array
	xor	vec_i, vec_i
	pxor	xp1, xp1		; clear the six accumulators
	pxor	xp2, xp2
	pxor	xp3, xp3
	pxor	xp4, xp4
	pxor	xp5, xp5
	pxor	xp6, xp6

.next_vect:				; fold one source vector into all six outputs
	mov	ptr, [src+vec_i]
	add	vec_i, PS
	XLDR	x0, [ptr+pos]		;Get next source vector

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
	movdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vskip1*1+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	movdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vskip1*2+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5 (start of 6th block)

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	;; table lookup + GF add for outputs 1-3
	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp3, xgft3_hi		;xp3 += partial


	;; reuse the xgft registers for tables D-F (outputs 4-6)
	movdqu	xgft1_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft1_hi, [tmp+vskip3+16]	; "     Dx{00}, Dx{10}, ..., Dx{f0}
	movdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
	movdqu	xgft2_hi, [tmp+vskip1*4+16]	; "     Ex{00}, Ex{10}, ..., Ex{f0}
	movdqu	xgft3_lo, [tmp+ptr]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
	movdqu	xgft3_hi, [tmp+ptr+16]	; "     Fx{00}, Fx{10}, ..., Fx{f0}
	add	tmp, 32			; advance to the next source's tables


	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp4, xgft1_hi		;xp4 += partial

	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp5, xgft2_hi		;xp5 += partial

	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp6, xgft3_hi		;xp6 += partial

	cmp	vec_i, vec
	jl	.next_vect


	;; fetch the remaining output pointers (scratch regs are free here)
	mov	tmp, [dest+2*PS]
	mov	ptr, [dest+3*PS]
	mov	vec_i, [dest+4*PS]

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	XSTR	[tmp+pos], xp3
	mov	tmp, [dest+5*PS]
	XSTR	[ptr+pos], xp4
	XSTR	[vec_i+pos], xp5
	XSTR	[tmp+pos], xp6

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp		; already stopped exactly at original length?
	je	.return_pass

	;; Tail len: redo the last slice aligned to the end of the buffers
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func             core, ver, snum
slversion gf_6vect_dot_prod_sse, 00,  05,  0066
|
352
erasure_code/gf_6vect_dot_prod_sse_perf.c
Normal file
352
erasure_code/gf_6vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,352 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf to stdout, 32 bytes per line. */
void dump(unsigned char *buf, int len)
{
	int idx;

	for (idx = 1; idx <= len; idx++) {
		printf(" %2x", 0xff & buf[idx - 1]);
		if (0 == (idx % 32))
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m matrix of bytes, one buffer of s per output row. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls;
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1;
|
||||
u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6;
|
||||
u8 *dest_ptrs[6], *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 6 * TEST_SOURCES * 32)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest6 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref6 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
dest_ptrs[5] = dest6;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest6, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
memset(dest_ref6, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs,
|
||||
dest_ref6);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 20; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
|
||||
buffs, dest_ref6);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_6vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 6) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 6) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test6\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
911
erasure_code/gf_6vect_dot_prod_sse_test.c
Normal file
911
erasure_code/gf_6vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,911 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 20000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf to stdout, 32 bytes per line. */
void dump(unsigned char *buf, int len)
{
	int n = 0;

	while (n < len) {
		printf(" %2x", 0xff & buf[n]);
		n++;
		if (n % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m matrix of bytes, one buffer of s per output row. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m matrix stored row-major in the flat byte array s. */
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *line = s + (row * m);

		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & line[col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls;
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1;
|
||||
u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6;
|
||||
u8 *dest_ptrs[6], *buffs[TEST_SOURCES];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[6];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest6 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref6 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
dest_ptrs[5] = dest6;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest6, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
memset(dest_ref6, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
memset(g4, 9, TEST_SOURCES);
|
||||
memset(g5, 4, TEST_SOURCES);
|
||||
memset(g6, 0xe6, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[160 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs,
|
||||
dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test6\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
|
||||
buffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
|
||||
dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[160 * srcs], buffs,
|
||||
dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test4 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test5 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test6 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref4, dest4, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref5, dest5, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref6, dest6, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[5] = dest6 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest6, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], ubuffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, udest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref6, udest_ptrs[5], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[5], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[3] - dest4;
|
||||
if (memcmp(dest4, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad4 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad4 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[4] - dest5;
|
||||
if (memcmp(dest5, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad5 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad5 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[5] - dest6;
|
||||
if (memcmp(dest6, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad6 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest6 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad6 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], buffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, dest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref6, dest_ptrs[5], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[5], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
394
erasure_code/gf_6vect_mad_avx.asm
Normal file
394
erasure_code/gf_6vect_mad_avx.asm
Normal file
@ -0,0 +1,394 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define tmp4 r14
|
||||
%define tmp5 rdi
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 5*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define tmp4 r13
|
||||
%define tmp5 r14
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp4
|
||||
%define dest3 tmp2
|
||||
%define dest4 mul_array
|
||||
%define dest5 tmp5
|
||||
%define dest6 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft4_lo xmm14
|
||||
%define xgft4_hi xmm13
|
||||
%define xgft5_lo xmm12
|
||||
%define xgft5_hi xmm11
|
||||
%define xgft6_lo xmm10
|
||||
%define xgft6_hi xmm9
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xtmph3 xmm6
|
||||
%define xtmpl3 xmm7
|
||||
%define xd1 xmm8
|
||||
%define xd2 xtmpl1
|
||||
%define xd3 xtmph1
|
||||
|
||||
|
||||
align 16
|
||||
global gf_6vect_mad_avx:function
|
||||
func(gf_6vect_mad_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov tmp, vec
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
lea tmp3, [mul_array + vec_i]
|
||||
sal tmp, 6 ;Multiply by 64
|
||||
|
||||
sal vec, 5 ;Multiply by 32
|
||||
lea vec_i, [tmp + vec] ;vec_i = vec*96
|
||||
lea mul_array, [tmp + vec_i] ;mul_array = vec*160
|
||||
|
||||
vmovdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
vmovdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
|
||||
vmovdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
|
||||
vmovdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
||||
vmovdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
|
||||
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest3, [dest1+2*PS]
|
||||
mov dest4, [dest1+3*PS] ; reuse mul_array
|
||||
mov dest5, [dest1+4*PS]
|
||||
mov dest6, [dest1+5*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop16:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
|
||||
;dest1
|
||||
vpshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
;dest2
|
||||
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xtmph2
|
||||
|
||||
;dest3
|
||||
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xtmph3
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
|
||||
;dest4
|
||||
XLDR xd1, [dest4+pos] ;Get next dest vector
|
||||
vpshufb xtmph1, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
;dest5
|
||||
vpshufb xtmph2, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2
|
||||
|
||||
;dest6
|
||||
vpshufb xtmph3, xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph3
|
||||
|
||||
XSTR [dest4+pos], xd1 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd2 ;Store result into dest5
|
||||
XSTR [dest6+pos], xd3 ;Store result into dest6
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan16:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
;; Overlapped offset length-16
|
||||
mov tmp, len ;Backup len as len=rdi
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph3, [constip16] ;Load const of i + 16
|
||||
vpinsrb xtmpl3, len.w, 15
|
||||
vpshufb xtmpl3, xmask0f ;Broadcast len to all bytes
|
||||
vpcmpgtb xtmpl3, xtmpl3, xtmph3
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
;dest4
|
||||
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpand xgft4_hi, xgft4_hi, xtmpl3
|
||||
vpxor xd1, xd1, xgft4_hi
|
||||
|
||||
;dest5
|
||||
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
vpand xgft5_hi, xgft5_hi, xtmpl3
|
||||
vpxor xd2, xd2, xgft5_hi
|
||||
|
||||
;dest6
|
||||
vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
vpand xgft6_hi, xgft6_hi, xtmpl3
|
||||
vpxor xd3, xd3, xgft6_hi
|
||||
|
||||
XSTR [dest4+tmp], xd1 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd2 ;Store result into dest5
|
||||
XSTR [dest6+tmp], xd3 ;Store result into dest6
|
||||
|
||||
vmovdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
|
||||
|
||||
;dest1
|
||||
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpand xgft4_hi, xgft4_hi, xtmpl3
|
||||
vpxor xd1, xd1, xgft4_hi
|
||||
|
||||
;dest2
|
||||
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
vpand xgft5_hi, xgft5_hi, xtmpl3
|
||||
vpxor xd2, xd2, xgft5_hi
|
||||
|
||||
;dest3
|
||||
vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
vpand xgft6_hi, xgft6_hi, xtmpl3
|
||||
vpxor xd3, xd3, xgft6_hi
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
constip16:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_6vect_mad_avx, 02, 01, 0210
|
400
erasure_code/gf_6vect_mad_avx2.asm
Normal file
400
erasure_code/gf_6vect_mad_avx2.asm
Normal file
@ -0,0 +1,400 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r15, 10*16 + 2*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r15, [rsp + 10*16 + 2*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp2
|
||||
%define dest4 mul_array
|
||||
%define dest5 vec
|
||||
%define dest6 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
	;;; Use Non-temporal load/store
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft2_lo ymm13
|
||||
%define xgft3_lo ymm12
|
||||
%define xgft4_lo ymm11
|
||||
%define xgft5_lo ymm10
|
||||
%define xgft6_lo ymm9
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmpl ymm2
|
||||
%define xtmplx xmm2
|
||||
%define xtmph ymm3
|
||||
%define xtmphx xmm3
|
||||
%define xd1 ymm4
|
||||
%define xd2 ymm5
|
||||
%define xd3 ymm6
|
||||
%define xd4 ymm7
|
||||
%define xd5 ymm8
|
||||
%define xd6 xd1
|
||||
|
||||
align 16
|
||||
global gf_6vect_mad_avx2:function
|
||||
func(gf_6vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5 ;Multiply by 32
|
||||
lea tmp, [mul_array + vec_i]
|
||||
mov vec_i, vec
|
||||
mov mul_array, vec
|
||||
sal vec_i, 1
|
||||
sal mul_array, 1
|
||||
add vec_i, vec ;vec_i=vec*96
|
||||
	add	mul_array, vec_i	;mul_array=vec*160
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
	vmovdqu	xgft4_lo, [tmp+vec_i]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
					; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
	vmovdqu	xgft6_lo, [tmp+mul_array]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
||||
					; " Fx{00}, Fx{10}, ..., Fx{f0}
|
||||
|
||||
mov dest2, [dest1+PS] ; reuse tmp3
|
||||
mov dest3, [dest1+2*PS] ; reuse tmp2
|
||||
mov dest4, [dest1+3*PS] ; reuse mul_array
|
||||
mov dest5, [dest1+4*PS] ; reuse vec
|
||||
mov dest6, [dest1+5*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop32:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;Get next dest vector
|
||||
XLDR xd4, [dest4+pos] ;Get next dest vector
|
||||
XLDR xd5, [dest5+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
;dest1
|
||||
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph ;xd1 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
|
||||
;dest2
|
||||
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph ;xd2 += partial
|
||||
|
||||
;dest3
|
||||
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph ;xd3 += partial
|
||||
|
||||
XLDR xd6, [dest6+pos] ;reuse xd1. Get next dest vector
|
||||
|
||||
;dest4
|
||||
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd4, xd4, xtmph ;xd4 += partial
|
||||
|
||||
;dest5
|
||||
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd5, xd5, xtmph ;xd5 += partial
|
||||
|
||||
;dest6
|
||||
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd6, xd6, xtmph ;xd6 += partial
|
||||
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
XSTR [dest4+pos], xd4 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd5 ;Store result into dest5
|
||||
XSTR [dest6+pos], xd6 ;Store result into dest6
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan32:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp.b, 0x1f
|
||||
vpinsrb xtmphx, xtmphx, tmp.w, 0
|
||||
vpbroadcastb xtmph, xtmphx ;Construct mask 0x1f1f1f...
|
||||
|
||||
mov tmp, len ;Overlapped offset length-32
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;Get next dest vector
|
||||
XLDR xd4, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd5, [dest5+tmp] ;Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vpinsrb xtmplx, xtmplx, len.w, 15
|
||||
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
|
||||
vpshufb xtmpl, xtmpl, xtmph ;Broadcast len to all bytes. xtmph=0x1f1f1f...
|
||||
vpcmpgtb xtmpl, xtmpl, [constip32]
|
||||
|
||||
vpand xtmph, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmph, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmph, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
;dest1
|
||||
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft1_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd1, xd1, xtmph ;xd1 += partial
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
|
||||
;dest2
|
||||
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft2_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd2, xd2, xtmph ;xd2 += partial
|
||||
|
||||
;dest3
|
||||
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft3_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd3, xd3, xtmph ;xd3 += partial
|
||||
|
||||
XLDR xd6, [dest6+tmp] ;reuse xd1. Get next dest vector
|
||||
|
||||
;dest4
|
||||
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft4_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd4, xd4, xtmph ;xd4 += partial
|
||||
|
||||
;dest5
|
||||
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft5_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd5, xd5, xtmph ;xd5 += partial
|
||||
|
||||
;dest6
|
||||
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft6_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd6, xd6, xtmph ;xd6 += partial
|
||||
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
XSTR [dest4+tmp], xd4 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd5 ;Store result into dest5
|
||||
XSTR [dest6+tmp], xd6 ;Store result into dest6
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
align 32
|
||||
constip32:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_6vect_mad_avx2, 04, 01, 0211
|
406
erasure_code/gf_6vect_mad_sse.asm
Normal file
406
erasure_code/gf_6vect_mad_sse.asm
Normal file
@ -0,0 +1,406 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define tmp4 r14
|
||||
%define tmp5 rdi
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 5*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define tmp4 r13
|
||||
%define tmp5 r14
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 mul_array
|
||||
%define dest3 tmp2
|
||||
%define dest4 tmp4
|
||||
%define dest5 tmp5
|
||||
%define dest6 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
	;;; Use Non-temporal load/store
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft4_lo xmm14
|
||||
%define xgft4_hi xmm13
|
||||
%define xgft5_lo xmm12
|
||||
%define xgft5_hi xmm11
|
||||
%define xgft6_lo xmm10
|
||||
%define xgft6_hi xmm9
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xtmph3 xmm6
|
||||
%define xtmpl3 xmm7
|
||||
%define xd1 xmm8
|
||||
%define xd2 xtmpl1
|
||||
%define xd3 xtmph1
|
||||
|
||||
|
||||
align 16
|
||||
global gf_6vect_mad_sse:function
|
||||
func(gf_6vect_mad_sse)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
|
||||
xor pos, pos
|
||||
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
|
||||
mov tmp, vec
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
lea tmp3, [mul_array + vec_i]
|
||||
sal tmp, 6 ;Multiply by 64
|
||||
|
||||
sal vec, 5 ;Multiply by 32
|
||||
	lea	vec_i, [tmp + vec]	;vec_i = vec*96
|
||||
	lea	mul_array, [tmp + vec_i]	;mul_array = vec*160
|
||||
|
||||
movdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
movdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
|
||||
movdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
|
||||
movdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
||||
movdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
|
||||
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest3, [dest1+2*PS]
|
||||
mov dest4, [dest1+3*PS] ; reuse mul_array
|
||||
mov dest5, [dest1+4*PS]
|
||||
mov dest6, [dest1+5*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop16:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
;dest1
|
||||
pshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
pxor xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest3 vector
|
||||
|
||||
;dest2
|
||||
pshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
pxor xd2, xtmph2
|
||||
|
||||
;dest3
|
||||
pshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
pxor xd3, xtmph3
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
|
||||
movdqa xtmph1, xgft4_hi ;Reload const array registers
|
||||
movdqa xtmpl1, xgft4_lo ;Reload const array registers
|
||||
movdqa xtmph2, xgft5_hi ;Reload const array registers
|
||||
movdqa xtmpl2, xgft5_lo ;Reload const array registers
|
||||
movdqa xtmph3, xgft6_hi ;Reload const array registers
|
||||
movdqa xtmpl3, xgft6_lo ;Reload const array registers
|
||||
|
||||
;dest4
|
||||
XLDR xd1, [dest4+pos] ;Get next dest vector
|
||||
pshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
pxor xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
;dest5
|
||||
pshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
pxor xd2, xtmph2
|
||||
|
||||
;dest6
|
||||
pshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
pxor xd3, xtmph3
|
||||
|
||||
XSTR [dest4+pos], xd1 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd2 ;Store result into dest5
|
||||
XSTR [dest6+pos], xd3 ;Store result into dest6
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan16:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
;; Overlapped offset length-16
|
||||
mov tmp, len ;Backup len as len=rdi
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
movdqa xtmph3, [constip16] ;Load const of i + 16
|
||||
pinsrb xtmpl3, len.w, 15
|
||||
pshufb xtmpl3, xmask0f ;Broadcast len to all bytes
|
||||
pcmpgtb xtmpl3, xtmph3
|
||||
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
;dest4
|
||||
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
pand xgft4_hi, xtmpl3
|
||||
pxor xd1, xgft4_hi
|
||||
|
||||
;dest5
|
||||
pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
pand xgft5_hi, xtmpl3
|
||||
pxor xd2, xgft5_hi
|
||||
|
||||
;dest6
|
||||
pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
pand xgft6_hi, xtmpl3
|
||||
pxor xd3, xgft6_hi
|
||||
|
||||
XSTR [dest4+tmp], xd1 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd2 ;Store result into dest5
|
||||
XSTR [dest6+tmp], xd3 ;Store result into dest6
|
||||
|
||||
movdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
movdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
movdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
movdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
movdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
movdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
|
||||
|
||||
;dest1
|
||||
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
pand xgft4_hi, xtmpl3
|
||||
pxor xd1, xgft4_hi
|
||||
|
||||
;dest2
|
||||
pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
pand xgft5_hi, xtmpl3
|
||||
pxor xd2, xgft5_hi
|
||||
|
||||
;dest3
|
||||
pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
pand xgft6_hi, xtmpl3
|
||||
pxor xd3, xgft6_hi
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
constip16:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_6vect_mad_sse, 00, 01, 020f
|
225
erasure_code/gf_inverse_test.c
Normal file
225
erasure_code/gf_inverse_test.c
Normal file
@ -0,0 +1,225 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include <assert.h>
|
||||
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 128
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Multiply two n x n matrices over GF(2^8): c = a * b.
 * Element products come from gf_mul(); accumulation uses XOR,
 * which is addition in GF(2^8). c must not alias a or b.
 */
void matrix_mult(u8 * a, u8 * b, u8 * c, int n)
{
	int row, col, k;
	u8 acc;

	for (row = 0; row < n; row++) {
		for (col = 0; col < n; col++) {
			acc = 0;
			for (k = 0; k < n; k++)
				acc ^= gf_mul(a[n * row + k], b[n * k + col]);
			c[row * n + col] = acc;
		}
	}
}
|
||||
|
||||
/* Dump an n x n byte matrix to stdout in hex, one row per line,
 * with a trailing blank line as a separator. */
void print_matrix(u8 * a, int n)
{
	int row, col;

	for (row = 0; row < n; row++) {
		for (col = 0; col < n; col++)
			printf(" %2x", a[row * n + col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Check whether the n x n matrix a is the identity matrix.
 * Returns 0 if it is, -1 on the first mismatching element.
 */
int is_ident(u8 * a, const int n)
{
	int row, col;
	u8 expect;

	for (row = 0; row < n; row++) {
		for (col = 0; col < n; col++) {
			expect = (row == col) ? 1 : 0;
			if (*a++ != expect)
				return -1;
		}
	}
	return 0;
}
|
||||
|
||||
/*
 * Invert the n x n matrix 'in' into 'inv' and verify inv * in == I.
 * 'sav' receives a pristine copy of 'in' (gf_invert_matrix works in place).
 * Returns 0 on success (and prints a progress dot), -1 on a singular
 * input or a failed identity check (diagnostics are printed).
 */
int inv_test(u8 * in, u8 * inv, u8 * sav, int n)
{
	// Preserve the original since inversion destroys 'in'
	memcpy(sav, in, n * n);

	if (gf_invert_matrix(in, inv, n) != 0) {
		printf("Given singular matrix\n");
		print_matrix(sav, n);
		return -1;
	}

	// inv * original should reproduce the identity
	matrix_mult(inv, sav, in, n);

	if (is_ident(in, n) != 0) {
		printf("fail\n");
		print_matrix(sav, n);
		print_matrix(inv, n);
		print_matrix(in, n);
		return -1;
	}

	putchar('.');

	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, k, t;
|
||||
u8 *test_mat, *save_mat, *invr_mat;
|
||||
|
||||
u8 test1[] = { 1, 1, 6,
|
||||
1, 1, 1,
|
||||
7, 1, 9
|
||||
};
|
||||
|
||||
u8 test2[] = { 0, 1, 6,
|
||||
1, 0, 1,
|
||||
0, 1, 9
|
||||
};
|
||||
|
||||
u8 test3[] = { 0, 0, 1,
|
||||
1, 0, 0,
|
||||
0, 1, 1
|
||||
};
|
||||
|
||||
u8 test4[] = { 0, 1, 6, 7,
|
||||
1, 1, 0, 0,
|
||||
0, 1, 2, 3,
|
||||
3, 2, 2, 3
|
||||
}; // = row3+3*row2
|
||||
|
||||
printf("gf_inverse_test: max=%d ", KMAX);
|
||||
|
||||
test_mat = malloc(KMAX * KMAX);
|
||||
save_mat = malloc(KMAX * KMAX);
|
||||
invr_mat = malloc(KMAX * KMAX);
|
||||
|
||||
if (NULL == test_mat || NULL == save_mat || NULL == invr_mat)
|
||||
return -1;
|
||||
|
||||
// Test with lots of leading 1's
|
||||
k = 3;
|
||||
memcpy(test_mat, test1, k * k);
|
||||
if (inv_test(test_mat, invr_mat, save_mat, k))
|
||||
return -1;
|
||||
|
||||
// Test with leading zeros
|
||||
k = 3;
|
||||
memcpy(test_mat, test2, k * k);
|
||||
if (inv_test(test_mat, invr_mat, save_mat, k))
|
||||
return -1;
|
||||
|
||||
// Test 3
|
||||
k = 3;
|
||||
memcpy(test_mat, test3, k * k);
|
||||
if (inv_test(test_mat, invr_mat, save_mat, k))
|
||||
return -1;
|
||||
|
||||
// Test 4 - try a singular matrix
|
||||
k = 4;
|
||||
memcpy(test_mat, test4, k * k);
|
||||
if (!gf_invert_matrix(test_mat, invr_mat, k)) {
|
||||
printf("Fail: didn't catch singular matrix\n");
|
||||
print_matrix(test4, 4);
|
||||
return -1;
|
||||
}
|
||||
// Do random test of size KMAX
|
||||
k = KMAX;
|
||||
|
||||
for (i = 0; i < k * k; i++)
|
||||
test_mat[i] = save_mat[i] = rand();
|
||||
|
||||
if (gf_invert_matrix(test_mat, invr_mat, k)) {
|
||||
printf("rand picked a singular matrix, try again\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
matrix_mult(invr_mat, save_mat, test_mat, k);
|
||||
|
||||
if (is_ident(test_mat, k)) {
|
||||
printf("fail\n");
|
||||
print_matrix(save_mat, k);
|
||||
print_matrix(invr_mat, k);
|
||||
print_matrix(test_mat, k);
|
||||
return -1;
|
||||
}
|
||||
// Do Randoms. Random size and coefficients
|
||||
for (t = 0; t < RANDOMS; t++) {
|
||||
k = rand() % KMAX;
|
||||
|
||||
for (i = 0; i < k * k; i++)
|
||||
test_mat[i] = save_mat[i] = rand();
|
||||
|
||||
if (gf_invert_matrix(test_mat, invr_mat, k))
|
||||
continue;
|
||||
|
||||
matrix_mult(invr_mat, save_mat, test_mat, k);
|
||||
|
||||
if (is_ident(test_mat, k)) {
|
||||
printf("fail rand k=%d\n", k);
|
||||
print_matrix(save_mat, k);
|
||||
print_matrix(invr_mat, k);
|
||||
print_matrix(test_mat, k);
|
||||
return -1;
|
||||
}
|
||||
if (0 == (t % 8))
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" Pass\n");
|
||||
return 0;
|
||||
}
|
166
erasure_code/gf_vect_dot_prod_1tbl.c
Normal file
166
erasure_code/gf_vect_dot_prod_1tbl.c
Normal file
@ -0,0 +1,166 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "test.h"
|
||||
#include "erasure_code.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / TEST_SOURCES
|
||||
# define TEST_LOOPS 10
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Global GF(256) tables
|
||||
u8 gff[256];
|
||||
u8 gflog[256];
|
||||
u8 gf_mul_table[256 * 256];
|
||||
|
||||
void mk_gf_field(void)
|
||||
{
|
||||
int i;
|
||||
u8 s = 1;
|
||||
gflog[0] = 0;
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
gff[i] = s;
|
||||
gflog[s] = i;
|
||||
s = (s << 1) ^ ((s & 0x80) ? 0x1d : 0); // mult by GF{2}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Populate a single 256x256 table with all multiply combinations for a
 * fast, single-table lookup of GF(2^8) multiply at the expense of memory
 * (64 KiB).  table[a*256 + b] == gf_mul(a, b).
 */
void mk_gf_mul_table(u8 * table)
{
	int a, b;

	for (a = 0; a < 256; a++)
		for (b = 0; b < 256; b++)
			table[a * 256 + b] = gf_mul(a, b);
}
|
||||
|
||||
/*
 * Reference GF(2^8) vector dot product:
 *   dest[i] = XOR over j of gf_mul(src[j][i], v[j])
 * for i in [0, len) and j in [0, vlen).
 */
void gf_vect_dot_prod_ref(int len, int vlen, u8 * v, u8 ** src, u8 * dest)
{
	int pos, n;
	u8 acc;

	for (pos = 0; pos < len; pos++) {
		acc = 0;
		for (n = 0; n < vlen; n++)
			acc ^= gf_mul(src[n][pos], v[n]);

		dest[pos] = acc;
	}
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int i, j, k;
|
||||
u8 s, vec[TEST_SOURCES], dest1[TEST_LEN], dest2[TEST_LEN];
|
||||
u8 *matrix[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
mk_gf_field();
|
||||
mk_gf_mul_table(gf_mul_table);
|
||||
|
||||
//generate random vector and matrix/data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
vec[i] = rand();
|
||||
|
||||
if (!(matrix[i] = malloc(TEST_LEN))) {
|
||||
fprintf(stderr, "Error failure\n\n");
|
||||
return -1;
|
||||
}
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
matrix[i][j] = rand();
|
||||
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++)
|
||||
gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);
|
||||
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_2tbl" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
// Warm up mult tables
|
||||
for (i = 0; i < TEST_LEN; i++) {
|
||||
s = 0;
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
s ^= gf_mul_table[vec[j] * 256 + matrix[j][i]];
|
||||
}
|
||||
dest2[i] = s;
|
||||
}
|
||||
|
||||
perf_start(&start);
|
||||
for (k = 0; k < TEST_LOOPS; k++) {
|
||||
for (i = 0; i < TEST_LEN; i++) {
|
||||
s = 0;
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
s ^= gf_mul_table[vec[j] * 256 + matrix[j][i]];
|
||||
}
|
||||
dest2[i] = s;
|
||||
}
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_1tbl" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * k);
|
||||
|
||||
// Compare with reference function
|
||||
if (0 != memcmp(dest1, dest2, TEST_LEN)) {
|
||||
printf("Error, different results!\n\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("Pass functional test\n");
|
||||
return 0;
|
||||
}
|
271
erasure_code/gf_vect_dot_prod_avx.asm
Normal file
271
erasure_code/gf_vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,271 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved and loaded
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 rdi ; must be saved and loaded
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define frame_size 2*8
|
||||
%define arg(x) [rsp + frame_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
rex_push_reg r12
|
||||
push_reg rdi
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop rdi
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
|
||||
%define trans ecx ;trans is for the variables in stack
|
||||
%define arg0 trans
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 trans
|
||||
%define arg1_m arg(1)
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 ebx
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 esi
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg3, arg(3)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
mov esp, ebp
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define vec_m arg1_m
|
||||
%define len_m arg0_m
|
||||
%define dest_m arg4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm5
|
||||
%define xgft_lo xmm4
|
||||
%define xgft_hi xmm3
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp xmm2
|
||||
|
||||
align 16
;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest)
;;; For each 16-byte chunk: dest[i] = XOR over j of gf_mul(g[j], buffs[j][i]),
;;; using two 16-entry nibble lookup tables (low/high) per source via vpshufb.
global gf_vect_dot_prod_avx:function
func(gf_vect_dot_prod_avx)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; bias len so 'pos <= len' tests the last full 16B chunk
	SSTR	len_m, len
	jl	.return_fail		; input shorter than one 16-byte vector
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

.loop16:
	vpxor	xp, xp			; clear the 16-byte accumulator
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:

	mov	ptr, [src+vec_i*PS]
	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft_hi, [tmp+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	XLDR	x0, [ptr+pos]		;Get next source vector

	add	tmp, 32			; 32 table bytes per source
	add	vec_i, 1

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft_hi, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xgft_lo, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft_hi, xgft_hi, xgft_lo	;GF add high and low partials
	vpxor	xp, xp, xgft_hi		;xp += partial

	SLDR	vec, vec_m
	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest, dest_m
	XSTR	[dest+pos], xp

	add	pos, 16			;Loop on 16 bytes at a time
	SLDR	len, len_m
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		; un-bias: tmp = original length
	cmp	pos, tmp
	je	.return_pass		; length was a multiple of 16 - done

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func        core, ver, snum
slversion gf_vect_dot_prod_avx, 02, 05, 0061
|
280
erasure_code/gf_vect_dot_prod_avx2.asm
Normal file
280
erasure_code/gf_vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,280 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved and loaded
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 rdi ; must be saved and loaded
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define frame_size 2*8
|
||||
%define arg(x) [rsp + frame_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
rex_push_reg r12
|
||||
push_reg rdi
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop rdi
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
|
||||
%define trans ecx ;trans is for the variables in stack
|
||||
%define arg0 trans
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 trans
|
||||
%define arg1_m arg(1)
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 ebx
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp.w edx
|
||||
%define tmp.b dl
|
||||
%define tmp2 edi
|
||||
%define tmp3 esi
|
||||
%define return eax
|
||||
%macro SLDR 2 ;stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg3, arg(3)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
mov esp, ebp
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define vec_m arg1_m
|
||||
%define len_m arg0_m
|
||||
%define dest_m arg4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm3
|
||||
%define xmask0fx xmm3
|
||||
%define xgft_lo ymm4
|
||||
%define xgft_hi ymm5
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp ymm2
|
||||
|
||||
align 16
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest)
;;; AVX2 variant of the GF(2^8) dot product; processes 32 bytes per pass by
;;; replicating the 16-byte nibble tables into both ymm lanes with vperm2i128.
global gf_vect_dot_prod_avx2:function
func(gf_vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32			; bias len so 'pos <= len' tests the last full 32B chunk
	SSTR	len_m, len
	jl	.return_fail		; input shorter than one 32-byte vector
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

.loop32:
	vpxor	xp, xp			; clear the 32-byte accumulator
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:

	mov	ptr, [src+vec_i*PS]

	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, Cx{02}, ...
					; "     Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo

	XLDR	x0, [ptr+pos]		;Get next source vector

	add	tmp, 32			; 32 table bytes per source
	add	vec_i, 1

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft_hi, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xgft_lo, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft_hi, xgft_hi, xgft_lo	;GF add high and low partials
	vpxor	xp, xp, xgft_hi		;xp += partial

	SLDR	vec, vec_m
	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest, dest_m
	XSTR	[dest+pos], xp

	add	pos, 32			;Loop on 32 bytes at a time
	SLDR	len, len_m
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		; un-bias: tmp = original length
	cmp	pos, tmp
	je	.return_pass		; length was a multiple of 32 - done

	;; Tail len
	mov	pos, len		;Overlapped offset length-32
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;; func        core, ver, snum
slversion gf_vect_dot_prod_avx2, 04, 05, 0190
|
184
erasure_code/gf_vect_dot_prod_avx_perf.c
Normal file
184
erasure_code/gf_vect_dot_prod_avx_perf.c
Normal file
@ -0,0 +1,184 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_avx
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf, 32 bytes per line, ending with a newline. */
void dump(unsigned char *buf, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		printf(" %2x", 0xff & buf[i]);
		if ((i + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Hex-dump the first m bytes of each of the k row buffers in s. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
|
||||
u8 *temp_buff, *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
}
|
525
erasure_code/gf_vect_dot_prod_avx_test.c
Normal file
525
erasure_code/gf_vect_dot_prod_avx_test.c
Normal file
@ -0,0 +1,525 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_avx
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print `len` bytes of `buf` to stdout as hex, 32 bytes per output row,
// ending with a final newline.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k-row matrix of byte buffers (m columns from each row pointer)
// as hex, one matrix row per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *p = s[row];

		for (col = 0; col < m; col++)
			printf(" %2x", p[col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat k*m byte array as a k-row, m-column hex matrix
// (row-major layout: element (i,j) is s[i*m + j]).
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *p = s + (row * m);

		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & p[col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Exhaustive correctness test for FUNCTION_UNDER_TEST (default
// gf_vect_dot_prod_avx) against the reference gf_vect_dot_prod_base:
//  1. all-zero input, 2. random data, 3. random data with varied source
//  counts, 4. an erasure-code encode/decode round trip, 5. many random
//  erasure trials, 6. end-of-buffer (Electric Fence) placement,
//  7. random pointer alignment with write-over detection, 8. all length
//  alignments. Returns 0 on pass, -1 on any mismatch or alloc failure.
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs, m, k, nerrs, r, err;
	void *buf;
	// g: per-source GF multipliers; g_tbls: 32-byte expanded mul table per
	// source (filled by gf_vect_mul_init); src_in_err: erasure flags.
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
	// a: encode matrix; b: a with erased rows removed; d: inverse of b.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptr;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned; never freed — process exits)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;

	// Test of all zeros
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);
	memset(g, 0, TEST_SOURCES);

	for (i = 0; i < TEST_SOURCES; i++)
		gf_vect_mul_init(g[i], &g_tbls[i * 32]);

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);

	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

	if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref, 25);
		printf("dprod:");
		dump(dest, 25);
		return -1;
	} else
		putchar('.');

	// Rand data test
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

		if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}

		putchar('.');
	}

	// Rand data test with varied parameters (every source count 1..TEST_SOURCES)
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < srcs; i++)
				g[i] = rand();

			for (i = 0; i < srcs; i++)
				gf_vect_mul_init(g[i], &g_tbls[i * 32]);

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
			FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);

			if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
				dump_matrix(buffs, 5, srcs);
				printf("dprod_base:");
				dump(dest_ref, 5);
				printf("dprod:");
				dump(dest, 5);
				return -1;
			}

			putchar('.');
		}
	}

	// Test erasure code using gf_vect_dot_prod

	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	gf_gen_rs_matrix(a, m, k);

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Make parity vects (rows k..m-1 hold parity computed from rows 0..k-1)
	for (i = k; i < m; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
	}

	// Random buffers in erasure (at most m-k sources marked lost)
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
		err = 1 & rand();
		src_in_err[i] = err;
		if (err)
			src_err_list[nerrs++] = i;
	}

	// construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
		printf("BAD MATRIX\n");

	// Gather the k surviving buffers used for recovery
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		recov[i] = buffs[r];
	}

	// Recover data: each lost source is a dot product of survivors with a
	// row of the inverted matrix; result must match the original buffer.
	for (i = 0; i < nerrs; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif

		if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buff, 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		gf_gen_rs_matrix(a, m, k);

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// Make parity vects
		for (i = k; i < m; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
		}

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
			err = 1 & rand();
			src_in_err[i] = err;
			if (err)
				src_err_list[nerrs++] = i;
		}
		if (nerrs == 0) {	// should have at least one error
			while ((err = (rand() % KMAX)) >= k) ;
			src_err_list[nerrs++] = err;
			src_in_err[err] = 1;
		}
		// construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
			printf("BAD MATRIX\n");

		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			recov[i] = buffs[r];
		}

		// Recover data
		for (i = 0; i < nerrs; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
			if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				// NOTE(review): this diagnostic loop reuses i, clobbering
				// the index of the failed recovery before the dumps below.
				for (i = 0; i < nerrs; i++)
					printf(" %d", src_err_list[i]);
				printf("\na:\n");
				dump_u8xu8((u8 *) a, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) d, k, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buff, 25);
				return -1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
		FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);

		if (0 != memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, align);
			printf("dprod:");
			dump(dest, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets (0..PTR_ALIGN_CHK_B-1 when misalignment enabled)
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest, 0, TEST_LEN);	// zero pad to check write-over

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);

		if (memcmp(dest_ref, udest_ptr, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(udest_ptr, 25);
			return -1;
		}
		// Confirm that padding around dests is unchanged
		memset(dest_ref, 0, PTR_ALIGN_CHK_B);	// Make reference zero buff
		offset = udest_ptr - dest;

		if (memcmp(dest, dest_ref, offset)) {
			printf("Fail rand ualign pad start\n");
			return -1;
		}
		if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad end\n");
			return -1;
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		srcs = TEST_SOURCES;

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);

		if (memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}
	}

	printf("done all: Pass\n");
	return 0;
}
|
290
erasure_code/gf_vect_dot_prod_base_test.c
Normal file
290
erasure_code/gf_vect_dot_prod_base_test.c
Normal file
@ -0,0 +1,290 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 250
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump `len` bytes of `buf`, wrapping to a new row every 32 bytes,
// followed by a trailing newline.
void dump(unsigned char *buf, int len)
{
	int n;

	for (n = 0; n < len; ) {
		printf(" %2x", 0xff & buf[n]);
		n++;
		if (n % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of the k buffers in `s` as a hex matrix,
// one buffer per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", s[r][c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a row-major k*m byte array `s` as a k x m hex matrix.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", 0xff & s[(r * m) + c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Correctness test for the reference gf_vect_dot_prod_base: encodes with a
// Cauchy matrix, erases random sources, recovers them via the inverted
// sub-matrix, and verifies recovered data matches the original — once with
// fixed (m=9, k=5) parameters and then RANDOMS times with random (m, k).
// Returns 0 on pass, -1 on any failure.
int main(int argc, char *argv[])
{
	int i, j, rtest, m, k, nerrs, r, err;
	void *buf;
	// g: scratch multipliers (zeroed, unused after init here); g_tbls:
	// 32 bytes of expanded mul table per source; src_in_err: erasure flags.
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
	// a: encode matrix; b: a minus erased rows; d: inverse of b.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];

	printf("gf_vect_dot_prod_base: %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned; never freed — process exits)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;

	// Init
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);
	memset(g, 0, TEST_SOURCES);

	// Test erasure code using gf_vect_dot_prod
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	gf_gen_cauchy1_matrix(a, m, k);

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Make parity vects (rows k..m-1 computed from data rows 0..k-1)
	for (i = k; i < m; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);

		gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, buffs, buffs[i]);
	}

	// Random buffers in erasure (at most m-k sources marked lost)
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
		err = 1 & rand();
		src_in_err[i] = err;
		if (err)
			src_err_list[nerrs++] = i;
	}

	// construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
		printf("BAD MATRIX\n");

	// Gather the k surviving buffers used for recovery
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		recov[i] = buffs[r];
	}

	// Recover data: each lost source is a dot product of survivors with a
	// row of the inverted matrix; result must equal the original buffer.
	for (i = 0; i < nerrs; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);

		gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, recov, temp_buff);

		if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buff, 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		gf_gen_cauchy1_matrix(a, m, k);

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// Make parity vects
		for (i = k; i < m; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);

			gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, buffs, buffs[i]);
		}

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
			err = 1 & rand();
			src_in_err[i] = err;
			if (err)
				src_err_list[nerrs++] = i;
		}
		if (nerrs == 0) {	// should have at least one error
			while ((err = (rand() % KMAX)) >= k) ;
			src_err_list[nerrs++] = err;
			src_in_err[err] = 1;
		}
		// construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
			printf("BAD MATRIX\n");

		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			recov[i] = buffs[r];
		}

		// Recover data
		for (i = 0; i < nerrs; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);

			gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, recov, temp_buff);

			if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				// NOTE(review): this diagnostic loop reuses i, clobbering
				// the index of the failed recovery before the dumps below.
				for (i = 0; i < nerrs; i++)
					printf(" %d", src_err_list[i]);
				printf("\na:\n");
				dump_u8xu8((u8 *) a, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) d, k, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buff, 25);
				return -1;
			}
		}
		putchar('.');
	}

	printf("done all: Pass\n");
	return 0;
}
|
184
erasure_code/gf_vect_dot_prod_perf.c
Normal file
184
erasure_code/gf_vect_dot_prod_perf.c
Normal file
@ -0,0 +1,184 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump `len` bytes of `buf` to stdout, 32 bytes per row, plus a
// closing newline.
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", buf[pos++] & 0xff);
		if (!(pos % 32))
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print m bytes from each of the k buffer pointers in `s` as a hex matrix,
// one buffer per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int r;

	for (r = 0; r < k; r++) {
		int c;

		for (c = 0; c < m; c++)
			printf(" %2x", s[r][c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
|
||||
u8 *temp_buff, *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
}
|
271
erasure_code/gf_vect_dot_prod_sse.asm
Normal file
271
erasure_code/gf_vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,271 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_dot_prod_sse(len, vec, *g_tbls, **buffs, *dest);
;;;
;;; GF(2^8) vector dot product: dest[i] = XOR over j of (g[j] * buffs[j][i]),
;;; processed 16 bytes at a time with SSSE3 pshufb nibble table lookups.
;;; g_tbls holds 32 bytes per source: low-nibble then high-nibble product table.
;;; Returns 0 on success, 1 when len < 16 (minimum block this kernel handles).

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r9
 %define return rax
 ; SLDR/SSTR are no-ops on 64-bit: every argument stays in a register
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved and loaded
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  rdi		; must be saved and loaded
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define frame_size 2*8
 %define arg(x)      [rsp + frame_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	rex_push_reg	r12
	push_reg	rdi
	end_prolog
	mov	arg4, arg(4)	; 5th arg is on the stack in the win64 ABI
 %endmacro

 %macro FUNC_RESTORE 0
	pop	rdi
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]

 ; Too few registers on i386: some "args" live in stack slots and are
 ; loaded on demand through SLDR into the shared scratch register.
 %define trans   ecx		;trans is for the variables in stack
 %define arg0    trans
 %define arg0_m  arg(0)
 %define arg1    trans
 %define arg1_m  arg(1)
 %define arg2    arg2_m		; late-expanding alias: arg2 is memory-only here
 %define arg2_m  arg(2)
 %define arg3    ebx
 %define arg4    trans
 %define arg4_m  arg(4)
 %define tmp     edx
 %define tmp2    edi
 %define tmp3    esi
 %define return  eax
 %macro SLDR 2			;; stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	push	esi
	push	edi
	push	ebx
	mov	arg3, arg(3)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	mov	esp, ebp
	pop	ebp
 %endmacro

%endif	; output formats

%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4

%define vec_i     tmp2
%define ptr       tmp3
%define pos       return

%ifidn PS,4				;32-bit code
 %define  vec_m   arg1_m
 %define  len_m   arg0_m
 %define  dest_m  arg4_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

%ifidn PS,8				;64-bit code
 default rel
 [bits 64]
%endif

section .text

%define xmask0f  xmm5
%define xgft_lo  xmm4
%define xgft_hi  xmm3

%define x0     xmm0
%define xtmpa  xmm1
%define xp     xmm2

align 16
global gf_vect_dot_prod_sse:function
func(gf_vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; rebase len to the last valid 16B offset
	SSTR	len_m, len
	jl	.return_fail		; original len < 16: unsupported
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

.loop16:
	; One 16-byte block of dest: accumulate GF partials over all vec sources.
	pxor	xp, xp
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:

	mov	ptr, [src+vec_i*PS]
	movdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft_hi, [tmp+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	XLDR	x0, [ptr+pos]		;Get next source vector

	add	tmp, 32			; 32 bytes of tables per source
	add	vec_i, 1

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	pshufb	xgft_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft_hi, xgft_lo	;GF add high and low partials
	pxor	xp, xgft_hi		;xp += partial

	SLDR	vec, vec_m
	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest, dest_m
	XSTR	[dest+pos], xp

	add	pos, 16			;Loop on 16 bytes at a time
	SLDR	len, len_m
	cmp	pos, len
	jle	.loop16

	; len holds (original_len - 16); if pos == original_len we covered
	; everything exactly, otherwise redo the final, possibly overlapping,
	; 16-byte block anchored at original_len - 16.
	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func          core, ver, snum
slversion gf_vect_dot_prod_sse, 00, 05, 0060
|
184
erasure_code/gf_vect_dot_prod_sse_perf.c
Normal file
184
erasure_code/gf_vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,184 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump the first len bytes of buf to stdout, 32 bytes per output row.
void dump(unsigned char *buf, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		printf(" %2x", 0xff & buf[i]);
		if ((i + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of the first k row pointers in s,
// one row per line, in hex.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
|
||||
u8 *temp_buff, *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
}
|
525
erasure_code/gf_vect_dot_prod_sse_test.c
Normal file
525
erasure_code/gf_vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,525 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf in hex, wrapping the output every 32 bytes.
void dump(unsigned char *buf, int len)
{
	int n = 0;

	while (n < len) {
		printf(" %2x", 0xff & buf[n]);
		n++;
		if (n % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Hex-print an array of k row pointers, m bytes per row, one row per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Hex-print a flat k-by-m byte matrix stored row-major in s.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Unit test driver for FUNCTION_UNDER_TEST (GF(2^8) vector dot product).
 *
 * Phases, each against gf_vect_dot_prod_base as reference:
 *   1. all-zero data, 2. random data, 3. random data with varied source
 *   counts, 4. Reed-Solomon encode + erasure recovery (fixed then random
 *   m/k), 5. buffers aligned to end-of-allocation (Electric Fence style),
 *   6. random pointer misalignment with write-over padding checks,
 *   7. every length from TEST_LEN down to TEST_MIN_SIZE.
 * Returns 0 on pass, -1 on any failure. The sequence of rand() calls is
 * part of the test's reproducibility — do not reorder them.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs, m, k, nerrs, r, err;
	void *buf;
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
	u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptr;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64B-aligned; never freed — process exit reclaims)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;

	// Test of all zeros: zero data and zero coefficients must give zero out
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);
	memset(g, 0, TEST_SOURCES);

	for (i = 0; i < TEST_SOURCES; i++)
		gf_vect_mul_init(g[i], &g_tbls[i * 32]);

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);

	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

	if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref, 25);
		printf("dprod:");
		dump(dest, 25);
		return -1;
	} else
		putchar('.');

	// Rand data test
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

		if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}

		putchar('.');
	}

	// Rand data test with varied parameters (every source count down to 1)
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < srcs; i++)
				g[i] = rand();

			for (i = 0; i < srcs; i++)
				gf_vect_mul_init(g[i], &g_tbls[i * 32]);

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
			FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);

			if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
				dump_matrix(buffs, 5, srcs);
				printf("dprod_base:");
				dump(dest_ref, 5);
				printf("dprod:");
				dump(dest, 5);
				return -1;
			}

			putchar('.');
		}
	}

	// Test erasure code using gf_vect_dot_prod

	// Pick a first test: m total rows, k data rows
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	gf_gen_rs_matrix(a, m, k);

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Make parity vects (rows k..m-1 of the RS matrix times the data)
	for (i = k; i < m; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
	}

	// Random buffers in erasure (at most m - k of the k data rows)
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
		err = 1 & rand();
		src_in_err[i] = err;
		if (err)
			src_err_list[nerrs++] = i;
	}

	// construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	// NOTE(review): failure only prints; test continues with d undefined
	if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
		printf("BAD MATRIX\n");

	// Gather the surviving rows for recovery
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		recov[i] = buffs[r];
	}

	// Recover data: each erased row = inverse-matrix row dotted with survivors
	for (i = 0; i < nerrs; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif

		if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buff, 25);
			printf("orig  :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests (random m and k each round)

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		gf_gen_rs_matrix(a, m, k);

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// Make parity vects
		for (i = k; i < m; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
		}

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
			err = 1 & rand();
			src_in_err[i] = err;
			if (err)
				src_err_list[nerrs++] = i;
		}
		if (nerrs == 0) {	// should have at least one error
			while ((err = (rand() % KMAX)) >= k) ;
			src_err_list[nerrs++] = err;
			src_in_err[err] = 1;
		}
		// construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
			printf("BAD MATRIX\n");

		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			recov[i] = buffs[r];
		}

		// Recover data
		for (i = 0; i < nerrs; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
			if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				// NOTE(review): reuses outer i; harmless only
				// because the path returns -1 below
				for (i = 0; i < nerrs; i++)
					printf(" %d", src_err_list[i]);
				printf("\na:\n");
				dump_u8xu8((u8 *) a, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) d, k, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig   :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buff, 25);
				return -1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
		FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);

		if (0 != memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, align);
			printf("dprod:");
			dump(dest, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// offset=1 makes the masks below span [0, PTR_ALIGN_CHK_B-1];
		// when PTR_ALIGN_CHK_B is 0 the masks collapse to 0 (aligned only)
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest, 0, TEST_LEN);	// zero pad to check write-over

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);

		if (memcmp(dest_ref, udest_ptr, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(udest_ptr, 25);
			return -1;
		}
		// Confirm that padding around dests is unchanged
		memset(dest_ref, 0, PTR_ALIGN_CHK_B);	// Make reference zero buff
		offset = udest_ptr - dest;

		if (memcmp(dest, dest_ref, offset)) {
			printf("Fail rand ualign pad start\n");
			return -1;
		}
		if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad end\n");
			return -1;
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		srcs = TEST_SOURCES;

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);

		if (memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}
	}

	printf("done all: Pass\n");
	return 0;
}
|
525
erasure_code/gf_vect_dot_prod_test.c
Normal file
525
erasure_code/gf_vect_dot_prod_test.c
Normal file
@ -0,0 +1,525 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 32
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump len bytes of buf to stdout, 32 bytes per output row.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", 0xff & buf[idx++]);
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of the k row buffers in s as hex,
// one buffer per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m byte matrix stored row-major in the flat array s.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs, m, k, nerrs, r, err;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
|
||||
u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
|
||||
u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
|
||||
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptr;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
} else
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
|
||||
dump_matrix(buffs, 5, srcs);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 5);
|
||||
printf("dprod:");
|
||||
dump(dest, 5);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Test erasure code using gf_vect_dot_prod
|
||||
|
||||
// Pick a first test
|
||||
m = 9;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
gf_gen_rs_matrix(a, m, k);
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// Make parity vects
|
||||
for (i = k; i < m; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Random buffers in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err)
|
||||
src_err_list[nerrs++] = i;
|
||||
}
|
||||
|
||||
// construct b by removing error rows
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
for (j = 0; j < k; j++)
|
||||
b[k * i + j] = a[k * r + j];
|
||||
}
|
||||
|
||||
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
|
||||
printf("BAD MATRIX\n");
|
||||
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
recov[i] = buffs[r];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
|
||||
#endif
|
||||
|
||||
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buff, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Do more random tests
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
gf_gen_rs_matrix(a, m, k);
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// Make parity vects
|
||||
for (i = k; i < m; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err)
|
||||
src_err_list[nerrs++] = i;
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= k) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
}
|
||||
// construct b by removing error rows
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
for (j = 0; j < k; j++)
|
||||
b[k * i + j] = a[k * r + j];
|
||||
}
|
||||
|
||||
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
|
||||
printf("BAD MATRIX\n");
|
||||
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
recov[i] = buffs[r];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
|
||||
#endif
|
||||
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (i = 0; i < nerrs; i++)
|
||||
printf(" %d", src_err_list[i]);
|
||||
printf("\na:\n");
|
||||
dump_u8xu8((u8 *) a, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) d, k, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buff, 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, align);
|
||||
printf("dprod:");
|
||||
dump(dest, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest, 0, TEST_LEN); // zero pad to check write-over
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);
|
||||
|
||||
if (memcmp(dest_ref, udest_ptr, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(udest_ptr, 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptr - dest;
|
||||
|
||||
if (memcmp(dest, dest_ref, offset)) {
|
||||
printf("Fail rand ualign pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);
|
||||
|
||||
if (memcmp(dest_ref, dest, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("done all: Pass\n");
|
||||
return 0;
|
||||
}
|
196
erasure_code/gf_vect_mad_avx.asm
Normal file
196
erasure_code/gf_vect_mad_avx.asm
Normal file
@ -0,0 +1,196 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
;;; GF(2^8) multiply-accumulate: dest[i] ^= C * src[i] for len bytes,
;;; where C is selected from mul_array by vec_i. AVX (128-bit) version,
;;; processing 16 bytes per iteration via two vpshufb nibble lookups.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, win64
 ; Windows x64 ABI: args in rcx, rdx, r8, r9; 5th/6th args on the stack,
 ; loaded into callee-saved r12/r15 by FUNC_SAVE below.
 %define arg0  rcx
 %define arg0.w ecx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define arg4  r12
 %define arg5  r15
 %define tmp    r11
 %define return rax
 %define return.w eax
 %define PS 8
 %define stack_size 16*3 + 3*8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save xmm6-8 (non-volatile on win64) and the two GP regs we borrow,
 ; then pull the stack-passed args into registers.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp+16*0],xmm6
	vmovdqa	[rsp+16*1],xmm7
	vmovdqa	[rsp+16*2],xmm8
	save_reg	r12,  3*16 + 0*8
	save_reg	r15,  3*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp+16*0]
	vmovdqa	xmm7, [rsp+16*1]
	vmovdqa	xmm8, [rsp+16*2]
	mov	r12,  [rsp + 3*16 + 0*8]
	mov	r15,  [rsp + 3*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ; System V AMD64 ABI: all six args arrive in registers; nothing to save.
 %define arg0  rdi
 %define arg0.w edi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp    r11
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec    arg1
%define vec_i    arg2
%define mul_array arg3
%define	src   arg4
%define dest  arg5
%define pos   return
%define pos.w return.w

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

%define xmask0f  xmm8
%define xgft_lo  xmm7
%define xgft_hi  xmm6

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph  xmm2
%define xtmpl  xmm3
%define xd     xmm4
%define xtmpd  xmm5

align 16
global gf_vect_mad_avx:function
func(gf_vect_mad_avx)
	FUNC_SAVE
	sub	len, 16			; fail if len < 16 (minimum vector width)
	jl	.return_fail

	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	sal	vec_i, 5		;Multiply by 32
	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	vmovdqu	xgft_hi, [vec_i+mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

	XLDR	xtmpd, [dest+len]	;backup the last 16 bytes in dest

.loop16:
	XLDR	xd, [dest+pos]		;Get next dest vector
.loop16_overlap:
	XLDR	x0, [src+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xtmph, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
	vpxor	xd, xd, xtmph		;xd += partial

	XSTR	[dest+pos], xd
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass		; len was a multiple of 16 - done

	;; Tail len
	;; Non-multiple-of-16 tail: redo the last (overlapping) 16 bytes.
	;; xtmpd holds the pre-call contents of that region so the already
	;; accumulated overlap is not applied twice.
	mov	pos, len		;Overlapped offset length-16
	vmovdqa	xd, xtmpd		;Restore xd
	jmp	.loop16_overlap		;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_vect_mad_avx, 02,  01,  0201
|
203
erasure_code/gf_vect_mad_avx2.asm
Normal file
203
erasure_code/gf_vect_mad_avx2.asm
Normal file
@ -0,0 +1,203 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
;;; GF(2^8) multiply-accumulate: dest[i] ^= C * src[i] for len bytes.
;;; AVX2 (256-bit) version, 32 bytes per iteration; the 16-entry nibble
;;; tables are broadcast into both ymm lanes so vpshufb works per-lane.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12 		; must be saved and loaded
 %define arg5   r15

 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax
 %define PS 8
 %define stack_size 16*3 + 3*8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save xmm6-8 (non-volatile on win64) and r12/r15, then load the two
 ; stack-passed args.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp+16*0],xmm6
	vmovdqa	[rsp+16*1],xmm7
	vmovdqa	[rsp+16*2],xmm8
	save_reg	r12,  3*16 + 0*8
	save_reg	r15,  3*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp+16*0]
	vmovdqa	xmm7, [rsp+16*1]
	vmovdqa	xmm8, [rsp+16*2]
	mov	r12,  [rsp + 3*16 + 0*8]
	mov	r15,  [rsp + 3*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ; System V AMD64 ABI: all six args in registers; no prologue needed.
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif


;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec    arg1
%define vec_i    arg2
%define mul_array arg3
%define	src   arg4
%define dest  arg5
%define pos   return
%define pos.w return.w

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

%define xmask0f   ymm8
%define xmask0fx  xmm8
%define xgft_lo   ymm7
%define xgft_hi   ymm6

%define x0     ymm0
%define xtmpa  ymm1
%define xtmph  ymm2
%define xtmpl  ymm3
%define xd     ymm4
%define xtmpd  ymm5

align 16
global gf_vect_mad_avx2:function
func(gf_vect_mad_avx2)
	FUNC_SAVE
	sub	len, 32			; fail if len < 32 (minimum vector width)
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	sal	vec_i, 5		;Multiply by 32
	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	; Duplicate the 16-byte hi/lo tables into both 128-bit lanes, since
	; vpshufb indexes within each lane independently.
	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo

	XLDR	xtmpd, [dest+len]	;backup the last 32 bytes in dest

.loop32:
	XLDR	xd, [dest+pos]		;Get next dest vector
.loop32_overlap:
	XLDR	x0, [src+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xtmph, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
	vpxor	xd, xd, xtmph		;xd += partial

	XSTR	[dest+pos], xd
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]
	cmp	pos, tmp
	je	.return_pass		; len was a multiple of 32 - done

	;; Tail len
	;; Non-multiple-of-32 tail: redo the last (overlapping) 32 bytes.
	;; xtmpd holds the pre-call contents of that region so the already
	;; accumulated overlap is not applied twice.
	mov	pos, len		;Overlapped offset length-32
	vmovdqa	xd, xtmpd		;Restore xd
	jmp	.loop32_overlap		;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func         core, ver, snum
slversion gf_vect_mad_avx2, 04,  01,  0202
|
374
erasure_code/gf_vect_mad_perf.c
Normal file
374
erasure_code/gf_vect_mad_perf.c
Normal file
@ -0,0 +1,374 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
#include "test.h"
|
||||
|
||||
//By default, test sse version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_4vect_mad_sse
|
||||
# define REF_FUNCTION gf_4vect_dot_prod_sse
|
||||
# define VECT 4
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
void dump(unsigned char *buf, int len)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < len;) {
|
||||
printf(" %2x", 0xff & buf[i++]);
|
||||
if (i % 32 == 0)
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void dump_matrix(unsigned char **s, int k, int m)
|
||||
{
|
||||
int i, j;
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < m; j++) {
|
||||
printf(" %2x", s[i][j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, l;
|
||||
void *buf;
|
||||
u8 gf[6][TEST_SOURCES];
|
||||
u8 *g_tbls;
|
||||
u8 *dest_ref[VECT];
|
||||
u8 *dest_ptrs[VECT], *buffs[TEST_SOURCES];
|
||||
u8 *dest_perf_ptrs[VECT];
|
||||
struct perf start, stop;
|
||||
|
||||
printf("test " xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, VECT * TEST_SOURCES * 32)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ptrs[i] = buf;
|
||||
memset(dest_ptrs[i], 0, TEST_LEN);
|
||||
}
|
||||
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref[i] = buf;
|
||||
memset(dest_ref[i], 0, TEST_LEN);
|
||||
}
|
||||
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_perf_ptrs[i] = buf;
|
||||
memset(dest_perf_ptrs[i], 0, TEST_LEN);
|
||||
}
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < VECT; i++)
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf[i][j] = rand();
|
||||
gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
|
||||
}
|
||||
|
||||
for (i = 0; i < VECT; i++)
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[i * 32 * TEST_SOURCES],
|
||||
buffs, dest_ref[i]);
|
||||
|
||||
for (i = 0; i < VECT; i++)
|
||||
memset(dest_ptrs[i], 0, TEST_LEN);
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], *dest_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], dest_ptrs);
|
||||
#endif
|
||||
}
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref[i], 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
#if (VECT == 1)
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
|
||||
#else
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
|
||||
#endif
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref[i], 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
|
||||
#if (VECT == 1)
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
|
||||
#else
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
|
||||
#endif
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
#if (VECT == 1)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
#elif (VECT == 2)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 3)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 4)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 5)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 6)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if (VECT == 1)
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
|
||||
#else
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
|
||||
#endif
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(REF_FUNCTION) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + VECT) * TEST_LOOPS);
|
||||
|
||||
#endif
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
}
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
#if (VECT == 1)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
#elif (VECT == 2)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 3)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 4)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 5)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 6)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
#endif
|
||||
}
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + VECT) * TEST_LOOPS);
|
||||
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
#if (VECT == 1)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
#elif (VECT == 2)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 3)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 4)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 5)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 6)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
#endif
|
||||
}
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) "_single_src" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (1 + VECT) * TEST_LOOPS);
|
||||
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) "_single_src_simple" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (1 + VECT) * TEST_LOOPS);
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
197
erasure_code/gf_vect_mad_sse.asm
Normal file
197
erasure_code/gf_vect_mad_sse.asm
Normal file
@ -0,0 +1,197 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
;;; Multiply-accumulate: dest ^= GF-mul(src) using the 32-byte table set
;;; for source vec_i in mul_array.  Returns 0 on pass, 1 if len < 16.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, win64
 ; Windows x64 ABI: args 4 and 5 arrive on the stack; xmm6-8, r12, r15
 ; are callee-saved and must be preserved across the call.
 %define arg0 rcx
 %define arg0.w ecx
 %define arg1 rdx
 %define arg2 r8
 %define arg3 r9
 %define arg4 r12
 %define arg5 r15
 %define tmp r11
 %define return rax
 %define return.w eax
 %define PS 8
 %define stack_size 16*3 + 3*8	; 3 xmm saves + 2 gp saves (+ pad)
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0],xmm6
	movdqa	[rsp+16*1],xmm7
	movdqa	[rsp+16*2],xmm8
	save_reg	r12, 3*16 + 0*8
	save_reg	r15, 3*16 + 1*8
	end_prolog
	; Fetch stack-passed args into the callee-saved registers
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	mov	r12, [rsp + 3*16 + 0*8]
	mov	r15, [rsp + 3*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ; System V AMD64 ABI: all six args in registers, nothing to save.
 %define arg0 rdi
 %define arg0.w edi
 %define arg1 rsi
 %define arg2 rdx
 %define arg3 rcx
 %define arg4 r8
 %define arg5 r9
 %define tmp r11
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest arg5
%define pos return
%define pos.w return.w

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

; Callee-saved xmm regs hold the per-call constants (win64)
%define xmask0f xmm8
%define xgft_lo xmm7
%define xgft_hi xmm6

%define x0 xmm0
%define xtmpa xmm1
%define xtmph xmm2
%define xtmpl xmm3
%define xd xmm4
%define xtmpd xmm5


align 16
global gf_vect_mad_sse:function
func(gf_vect_mad_sse)
	FUNC_SAVE
	sub	len, 16
	jl	.return_fail	; need at least one 16-byte block

	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec_i, 5	;Multiply by 32
	movdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xgft_hi, [vec_i+mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

	XLDR	xtmpd, [dest+len]	;backup the last 16 bytes in dest

.loop16:
	XLDR	xd, [dest+pos]	;Get next dest vector
.loop16_overlap:
	XLDR	x0, [src+pos]	;Get next source vector
	movdqa	xtmph, xgft_hi	;Reload const array registers
	movdqa	xtmpl, xgft_lo
	movdqa	xtmpa, x0	;Keep unshifted copy of src
	psraw	x0, 4	;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f	;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f	;Mask low src nibble in bits 4-0
	pshufb	xtmph, x0	;Lookup mul table of high nibble
	pshufb	xtmpl, xtmpa	;Lookup mul table of low nibble
	pxor	xtmph, xtmpl	;GF add high and low partials

	pxor	xd, xtmph	;Accumulate into dest
	XSTR	[dest+pos], xd	;Store result

	add	pos, 16	;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	; If pos == len+16 the length was an exact multiple of 16 -- done.
	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len: rerun the final (overlapping) 16-byte block at
	;; offset len-16, using xtmpd (saved before the loop) as the
	;; dest value so already-updated bytes are not accumulated twice.
	mov	pos, len	;Overlapped offset length-16
	movdqa	xd, xtmpd	;Restore xd
	jmp	.loop16_overlap	;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func core, ver, snum
slversion gf_vect_mad_sse, 00, 01, 0200
|
508
erasure_code/gf_vect_mad_test.c
Normal file
508
erasure_code/gf_vect_mad_test.c
Normal file
@ -0,0 +1,508 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef ALIGN_SIZE
|
||||
# define ALIGN_SIZE 16
|
||||
#endif
|
||||
|
||||
//By default, test sse version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_6vect_mad_sse
|
||||
# define REF_FUNCTION gf_6vect_dot_prod_sse
|
||||
# define VECT 6
|
||||
#endif
|
||||
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE ALIGN_SIZE
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 20000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B ALIGN_SIZE
|
||||
# define LEN_ALIGN_CHK_B ALIGN_SIZE // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump len bytes of buf, 32 bytes per line, with a final newline.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print k buffers of m bytes each (array-of-pointers layout) in hex,
// one buffer per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m byte matrix stored contiguously (row-major) in hex,
// one row per line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[(row * m) + col] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Functional test for FUNCTION_UNDER_TEST (a gf_Nvect_mad kernel),
 * checked against gf_vect_dot_prod_base and REF_FUNCTION across:
 * fixed coefficients, random data, varied source counts, end-of-buffer
 * (Electric Fence style) placement, unaligned pointers, and all length
 * alignments.  Returns 0 on pass, -1 on any mismatch or alloc failure.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs;
	void *buf;
	u8 gf[6][TEST_SOURCES];	// GF coefficients, one row per output vector
	u8 *g_tbls;		// expanded multiply tables
	u8 *dest_ref[VECT];	// reference outputs
	u8 *dest_ptrs[VECT], *buffs[TEST_SOURCES];
	int vector = VECT;

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];	// sources shifted to buffer end
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];	// deliberately misaligned sources
	u8 *udest_ptrs[VECT];	// deliberately misaligned dests
	printf("test" xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	// 2x the table space of vector*TEST_SOURCES*32 bytes
	// NOTE(review): the 2x factor's purpose is not visible here -- the
	// indexing below only uses the first half; presumably headroom.
	if (posix_memalign(&buf, 16, 2 * (vector * TEST_SOURCES * 32))) {
		printf("alloc error: Fail");
		return -1;
	}
	g_tbls = buf;

	for (i = 0; i < vector; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ptrs[i] = buf;
		memset(dest_ptrs[i], 0, TEST_LEN);
	}

	for (i = 0; i < vector; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ref[i] = buf;
		memset(dest_ref[i], 0, TEST_LEN);
	}

	// Test of all zeros
	// NOTE(review): the zeroed buffers and fixed coefficients set up
	// here are overwritten with rand() data below before the first
	// kernel call, so the "zero" check actually runs on random data.
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	// Fixed per-vector coefficients; cases intentionally fall through
	// so every row 0..vector-1 is initialized.
	switch (vector) {
	case 6:
		memset(gf[5], 0xe6, TEST_SOURCES);
	case 5:
		memset(gf[4], 4, TEST_SOURCES);
	case 4:
		memset(gf[3], 9, TEST_SOURCES);
	case 3:
		memset(gf[2], 7, TEST_SOURCES);
	case 2:
		memset(gf[1], 1, TEST_SOURCES);
	case 1:
		memset(gf[0], 2, TEST_SOURCES);
		break;
	default:
		return -1;
	}

	for (i = 0; i < TEST_SOURCES; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Random coefficients; table set i for source j lives at
	// g_tbls[i*32*TEST_SOURCES + j*32]
	for (i = 0; i < vector; i++)
		for (j = 0; j < TEST_SOURCES; j++) {
			gf[i][j] = rand();
			gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
		}

	for (i = 0; i < vector; i++)
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[i * 32 * TEST_SOURCES],
				      buffs, dest_ref[i]);

	// Accumulating every source via the mad kernel into zeroed dests
	// must equal the dot-product reference.
	for (i = 0; i < vector; i++)
		memset(dest_ptrs[i], 0, TEST_LEN);
	for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
		// 1-vect kernels take a plain u8* destination
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], *dest_ptrs);
#else
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], dest_ptrs);
#endif
	}
	for (i = 0; i < vector; i++) {
		if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
			printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
			dump_matrix(buffs, vector, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref[i], 25);
			printf("dprod_dut:");
			dump(dest_ptrs[i], 25);
			return -1;
		}
	}

	// Cross-check against the optimized dot-product as well
#if (VECT == 1)
	REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
#else
	REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
#endif
	for (i = 0; i < vector; i++) {
		if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
			printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
			dump_matrix(buffs, vector, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref[i], 25);
			printf("dprod_dut:");
			dump(dest_ptrs[i], 25);
			return -1;
		}
	}

	putchar('.');

	// Rand data test

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < vector; i++)
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], buffs,
					      dest_ref[i]);

		for (i = 0; i < vector; i++)
			memset(dest_ptrs[i], 0, TEST_LEN);
		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test%d %d\n",
				       i, rtest);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[i], 25);
				return -1;
			}
		}

		putchar('.');
	}

	// Rand data test with varied parameters: every source count from
	// TEST_SOURCES down to 1 (table stride depends on srcs).
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < vector; i++)
				for (j = 0; j < srcs; j++) {
					gf[i][j] = rand();
					gf_vect_mul_init(gf[i][j],
							 &g_tbls[i * (32 * srcs) + j * 32]);
				}

			for (i = 0; i < vector; i++)
				gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[i * 32 * srcs],
						      buffs, dest_ref[i]);

			for (i = 0; i < vector; i++)
				memset(dest_ptrs[i], 0, TEST_LEN);
			for (i = 0; i < srcs; i++) {
#if (VECT == 1)
				FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
						    *dest_ptrs);
#else
				FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
						    dest_ptrs);
#endif

			}
			for (i = 0; i < vector; i++) {
				if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
					printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
					       " test%d srcs=%d\n", i, srcs);
					dump_matrix(buffs, vector, TEST_SOURCES);
					printf("dprod_base:");
					dump(dest_ref[i], 25);
					printf("dprod_dut:");
					dump(dest_ptrs[i], 25);
					return -1;
				}
			}

			putchar('.');
		}
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < vector; i++)
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], efence_buffs,
					      dest_ref[i]);

		for (i = 0; i < vector; i++)
			memset(dest_ptrs[i], 0, size);
		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d size=%d\n", i, size);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], TEST_MIN_SIZE + align);
				printf("dprod_dut:");
				dump(dest_ptrs[i], TEST_MIN_SIZE + align);
				return -1;
			}
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// offset=1 when alignment checking is enabled, else 0
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		for (i = 0; i < vector; i++) {
			udest_ptrs[i] = dest_ptrs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
			memset(dest_ptrs[i], 0, TEST_LEN);	// zero pad to check write-over
		}

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < vector; i++)
			for (j = 0; j < srcs; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * srcs) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, srcs, &g_tbls[i * 32 * srcs], ubuffs,
					      dest_ref[i]);

		for (i = 0; i < srcs; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], *udest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], udest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], udest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d ualign srcs=%d\n", i, srcs);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(udest_ptrs[i], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(dest_ref[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		for (i = 0; i < vector; i++) {
			// Bytes before udest_ptrs[i] and after the written
			// region must still be the zero padding.
			offset = udest_ptrs[i] - dest_ptrs[i];
			if (memcmp(dest_ptrs[i], dest_ref[0], offset)) {
				printf("Fail rand ualign pad1 start\n");
				return -1;
			}
			if (memcmp
			    (dest_ptrs[i] + offset + size, dest_ref[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign pad1 end\n");
				return -1;
			}
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < vector; i++) {
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}
			memset(dest_ptrs[i], 0, TEST_LEN);	// zero pad to check write-over
		}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], buffs,
					      dest_ref[i]);

		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d ualign len=%d\n", i, size);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[i], 25);
				return -1;
			}
		}

		putchar('.');

	}

	printf("Pass\n");
	return 0;

}
|
164
erasure_code/gf_vect_mul_avx.asm
Normal file
164
erasure_code/gf_vect_mul_avx.asm
Normal file
@ -0,0 +1,164 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mul_avx(len, mul_array, src, dest)
;;;
;;; GF(2^8) constant multiply of a byte buffer, AVX version.
;;; mul_array is a 32B table: 16 products of the constant with each low
;;; nibble value, followed by 16 products with each high nibble value
;;; (see the two 16B loads into xgft_lo/xgft_hi below).
;;; NOTE(review): the loop consumes 32 bytes per pass with no tail
;;; handling, so len is assumed to be a multiple of 32 -- confirm callers
;;; guarantee this.
;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
; System V AMD64 calling convention: args in registers, no xmm save needed
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define return rax
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
; Windows x64 calling convention: xmm6-xmm15 are callee-saved, so the
; xmm registers clobbered below (xmm6, xmm7, xmm13-xmm15) must be spilled
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define return rax
 %define stack_size  5*16 + 8 	; must be an odd multiple of 8
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm13, 2*16
	save_xmm128	xmm14, 3*16
	save_xmm128	xmm15, 4*16
	end_prolog
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm13, [rsp + 2*16]
	vmovdqa	xmm14, [rsp + 3*16]
	vmovdqa	xmm15, [rsp + 4*16]
	add	rsp, stack_size
 %endmacro

%endif


%define len   arg0
%define mul_array arg1
%define src   arg2
%define dest  arg3
%define pos   return		; loop cursor doubles as the return value


;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
 %define XLDR vmovdqa
 %define XSTR vmovdqa
%else
 %define XLDR vmovntdqa
 %define XSTR vmovntdq
%endif

default rel

[bits 64]
section .text

; xmm13-15 hold loop-invariant constants; xmm0-7 are per-iteration scratch
%define xmask0f  xmm15
%define xgft_lo  xmm14
%define xgft_hi  xmm13

%define x0     xmm0
%define xtmp1a xmm1
%define xtmp1b xmm2
%define xtmp1c xmm3
%define x1     xmm4
%define xtmp2a xmm5
%define xtmp2b xmm6
%define xtmp2c xmm7

align 16
global gf_vect_mul_avx:function
func(gf_vect_mul_avx)
	FUNC_SAVE
	mov	pos, 0
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	vmovdqu	xgft_lo, [mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	vmovdqu	xgft_hi, [mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

loop32:
	XLDR	x0, [src+pos]		;Get next source vector
	XLDR	x1, [src+pos+16]	;Get next source vector + 16B ahead
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	vpand	xtmp1a, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpand	xtmp2a, x1, xmask0f
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpsraw	x1, x1, 4
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vpand	x1, x1, xmask0f
	vpshufb	xtmp1b, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmp1c, xgft_lo, xtmp1a	;Lookup mul table of low nibble
	vpshufb	xtmp2b, xgft_hi, x1	;Lookup mul table of high nibble
	vpshufb	xtmp2c, xgft_lo, xtmp2a	;Lookup mul table of low nibble
	vpxor	xtmp1b, xtmp1b, xtmp1c	;GF add high and low partials
	vpxor	xtmp2b, xtmp2b, xtmp2c
	XSTR	[dest+pos-32], xtmp1b	;Store result
	XSTR	[dest+pos-16], xtmp2b	;Store +16B result
	jl	loop32


return_pass:
	FUNC_RESTORE
	sub	pos, len		;pos == len here, so rax (return) becomes 0
	ret

return_fail:
	; NOTE(review): unreachable -- no branch targets this label; kept for
	; symmetry with sibling kernels that do validate their arguments
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16

mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_vect_mul_avx, 01, 03, 0036
|
99
erasure_code/gf_vect_mul_avx_perf.c
Normal file
99
erasure_code/gf_vect_mul_avx_perf.c
Normal file
@ -0,0 +1,99 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / 2
|
||||
# define TEST_LOOPS 1000
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define TEST_MEM (2 * TEST_LEN)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
|
||||
struct perf start, stop;
|
||||
|
||||
printf("gf_vect_mul_avx_perf:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
// Allocate large mem region
|
||||
buff1 = (u8 *) malloc(TEST_LEN);
|
||||
buff2 = (u8 *) malloc(TEST_LEN);
|
||||
if (NULL == buff1 || NULL == buff2) {
|
||||
printf("Failed to allocate %dB\n", TEST_LEN);
|
||||
return 1;
|
||||
}
|
||||
|
||||
memset(buff1, 0, TEST_LEN);
|
||||
memset(buff2, 0, TEST_LEN);
|
||||
|
||||
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
|
||||
printf("Start timed tests\n");
|
||||
fflush(0);
|
||||
|
||||
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_mul_avx" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * i);
|
||||
|
||||
return 0;
|
||||
}
|
143
erasure_code/gf_vect_mul_avx_test.c
Normal file
143
erasure_code/gf_vect_mul_avx_test.c
Normal file
@ -0,0 +1,143 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE 8192
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 100000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
unsigned char *efence_buff3;
|
||||
|
||||
printf("gf_vect_mul_avx:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul_avx(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
|
||||
gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
printf("Random tests ");
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_avx(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
efence_buff3 = buff3 + size;
|
||||
|
||||
gf_vect_mul_avx(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (efence_buff2[i] != efence_buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, efence_buff2[i], efence_buff3[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
return 0;
|
||||
}
|
129
erasure_code/gf_vect_mul_base_test.c
Normal file
129
erasure_code/gf_vect_mul_base_test.c
Normal file
@ -0,0 +1,129 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE 8192
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 100000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
|
||||
printf("gf_vect_mul_base_test:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
|
||||
gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
printf("Random tests ");
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
return 0;
|
||||
}
|
99
erasure_code/gf_vect_mul_perf.c
Normal file
99
erasure_code/gf_vect_mul_perf.c
Normal file
@ -0,0 +1,99 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / 2
|
||||
# define TEST_LOOPS 1000
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define TEST_MEM (2 * TEST_LEN)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
|
||||
struct perf start, stop;
|
||||
|
||||
printf("gf_vect_mul_perf:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
// Allocate large mem region
|
||||
buff1 = (u8 *) malloc(TEST_LEN);
|
||||
buff2 = (u8 *) malloc(TEST_LEN);
|
||||
if (NULL == buff1 || NULL == buff2) {
|
||||
printf("Failed to allocate %dB\n", TEST_LEN);
|
||||
return 1;
|
||||
}
|
||||
|
||||
memset(buff1, 0, TEST_LEN);
|
||||
memset(buff2, 0, TEST_LEN);
|
||||
|
||||
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
|
||||
printf("Start timed tests\n");
|
||||
fflush(0);
|
||||
|
||||
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_mul" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * i);
|
||||
|
||||
return 0;
|
||||
}
|
170
erasure_code/gf_vect_mul_sse.asm
Normal file
170
erasure_code/gf_vect_mul_sse.asm
Normal file
@ -0,0 +1,170 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mul_sse(len, mul_array, src, dest)
;;;
;;; GF(2^8) constant multiply of a byte buffer, SSE version.
;;; mul_array is a 32B table: 16 products of the constant with each low
;;; nibble value, followed by 16 products with each high nibble value
;;; (see the two 16B loads into xgft_lo/xgft_hi below).
;;; NOTE(review): the loop consumes 32 bytes per pass with no tail
;;; handling, so len is assumed to be a multiple of 32 -- confirm callers
;;; guarantee this.
;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
; System V AMD64 calling convention: args in registers, no xmm save needed
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define return rax
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
; Windows x64 calling convention: xmm6-xmm15 are callee-saved, so the
; xmm registers clobbered below (xmm6, xmm7, xmm13-xmm15) must be spilled
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define return rax
 %define stack_size  5*16 + 8 	; must be an odd multiple of 8
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm13, 2*16
	save_xmm128	xmm14, 3*16
	save_xmm128	xmm15, 4*16
	end_prolog
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm13, [rsp + 2*16]
	movdqa	xmm14, [rsp + 3*16]
	movdqa	xmm15, [rsp + 4*16]
	add	rsp, stack_size
 %endmacro

%endif


%define len   arg0
%define mul_array arg1
%define src   arg2
%define dest  arg3
%define pos   return		; loop cursor doubles as the return value


;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
 %define XLDR movdqa
 %define XSTR movdqa
%else
 %define XLDR movntdqa
 %define XSTR movntdq
%endif

default rel

[bits 64]
section .text

; xmm13-15 hold loop-invariant constants; xmm0-7 are per-iteration scratch
%define xmask0f  xmm15
%define xgft_lo  xmm14
%define xgft_hi  xmm13

%define x0     xmm0
%define xtmp1a xmm1
%define xtmp1b xmm2
%define xtmp1c xmm3
%define x1     xmm4
%define xtmp2a xmm5
%define xtmp2b xmm6
%define xtmp2c xmm7


align 16
global gf_vect_mul_sse:function
func(gf_vect_mul_sse)
	FUNC_SAVE
	mov	pos, 0
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	movdqu	xgft_lo, [mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xgft_hi, [mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

loop32:
	XLDR	x0, [src+pos]		;Get next source vector
	XLDR	x1, [src+pos+16]	;Get next source vector + 16B ahead
	; Two-operand SSE pshufb overwrites its destination, so the const
	; tables must be recopied each pass (AVX variant avoids this)
	movdqa	xtmp1b, xgft_hi		;Reload const array registers
	movdqa	xtmp1c, xgft_lo
	movdqa	xtmp2b, xgft_hi
	movdqa	xtmp2c, xgft_lo
	movdqa	xtmp1a, x0		;Keep unshifted copy of src
	movdqa	xtmp2a, x1
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	psraw	x1, 4
	pand	xtmp1a, xmask0f		;Mask low src nibble in bits 4-0
	pand	xtmp2a, xmask0f
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	x1, xmask0f
	pshufb	xtmp1b, x0		;Lookup mul table of high nibble
	pshufb	xtmp1c, xtmp1a		;Lookup mul table of low nibble
	pshufb	xtmp2b, x1
	pshufb	xtmp2c, xtmp2a
	pxor	xtmp1b, xtmp1c		;GF add high and low partials
	pxor	xtmp2b, xtmp2c
	XSTR	[dest+pos], xtmp1b	;Store result
	XSTR	[dest+pos+16], xtmp2b	;Store +16B result
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jl	loop32


return_pass:
	sub	pos, len		;pos == len here, so rax (return) becomes 0
	FUNC_RESTORE
	ret

return_fail:
	; NOTE(review): unreachable -- no branch targets this label; kept for
	; symmetry with sibling kernels that do validate their arguments
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_vect_mul_sse, 00, 03, 0034
|
97
erasure_code/gf_vect_mul_sse_perf.c
Normal file
97
erasure_code/gf_vect_mul_sse_perf.c
Normal file
@ -0,0 +1,97 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / 2
|
||||
# define TEST_LOOPS 1000
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define TEST_MEM (2 * TEST_LEN)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
|
||||
struct perf start, stop;
|
||||
|
||||
printf("gf_vect_mul_sse_perf:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
// Allocate large mem region
|
||||
buff1 = (u8 *) malloc(TEST_LEN);
|
||||
buff2 = (u8 *) malloc(TEST_LEN);
|
||||
if (NULL == buff1 || NULL == buff2) {
|
||||
printf("Failed to allocate %dB\n", TEST_LEN);
|
||||
return 1;
|
||||
}
|
||||
|
||||
memset(buff1, 0, TEST_LEN);
|
||||
memset(buff2, 0, TEST_LEN);
|
||||
|
||||
printf("Start timed tests\n");
|
||||
fflush(0);
|
||||
|
||||
gf_vect_mul_sse(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl); // in a re-build would only calc once
|
||||
gf_vect_mul_sse(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_mul_sse" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * i);
|
||||
|
||||
return 0;
|
||||
}
|
160
erasure_code/gf_vect_mul_sse_test.c
Normal file
160
erasure_code/gf_vect_mul_sse_test.c
Normal file
@ -0,0 +1,160 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE (128*1024)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int tsize;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
unsigned char *efence_buff3;
|
||||
|
||||
printf("gf_vect_mul_sse_test: ");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul_sse(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++) {
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i,
|
||||
buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++) {
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_sse(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Check buffer len
|
||||
for (tsize = TEST_SIZE; tsize > 0; tsize -= 32) {
|
||||
a = rand();
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_sse(tsize, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < tsize; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
if (0 == tsize % (32 * 8)) {
|
||||
putchar('.');
|
||||
fflush(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
efence_buff3 = buff3 + size;
|
||||
|
||||
gf_vect_mul_sse(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (efence_buff2[i] != efence_buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, efence_buff2[i], efence_buff3[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
fflush(0);
|
||||
return 0;
|
||||
}
|
142
erasure_code/gf_vect_mul_test.c
Normal file
142
erasure_code/gf_vect_mul_test.c
Normal file
@ -0,0 +1,142 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE 8192
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 100000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
unsigned char *efence_buff3;
|
||||
|
||||
printf("gf_vect_mul_test:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
|
||||
gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
printf("Random tests ");
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++) {
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
efence_buff3 = buff3 + size;
|
||||
|
||||
gf_vect_mul(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i],
|
||||
gf_mul(2, efence_buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (efence_buff2[i] != efence_buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, efence_buff2[i], efence_buff3[i],
|
||||
gf_mul(2, efence_buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
return 0;
|
||||
}
|
933
include/erasure_code.h
Normal file
933
include/erasure_code.h
Normal file
@ -0,0 +1,933 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
#ifndef _ERASURE_CODE_H_
|
||||
#define _ERASURE_CODE_H_
|
||||
|
||||
/**
|
||||
* @file erasure_code.h
|
||||
* @brief Interface to functions supporting erasure code encode and decode.
|
||||
*
|
||||
* This file defines the interface to optimized functions used in erasure
|
||||
* codes. Encode and decode of erasures in GF(2^8) are made by calculating the
|
||||
* dot product of the symbols (bytes in GF(2^8)) across a set of buffers and a
|
||||
* set of coefficients. Values for the coefficients are determined by the type
|
||||
* of erasure code. Using a general dot product means that any sequence of
|
||||
* coefficients may be used including erasure codes based on random
|
||||
* coefficients.
|
||||
* Multiple versions of dot product are supplied to calculate 1-6 output
|
||||
* vectors in one pass.
|
||||
* Base GF multiply and divide functions can be sped up by defining
|
||||
* GF_LARGE_TABLES at the expense of memory size.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "gf_vect_mul.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Initialize tables for fast Erasure Code encode and decode.
|
||||
*
|
||||
* Generates the expanded tables needed for fast encode or decode for erasure
|
||||
* codes on blocks of data. 32bytes is generated for each input coefficient.
|
||||
*
|
||||
* @param k The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param rows The number of output vectors to concurrently encode/decode.
|
||||
* @param a Pointer to sets of arrays of input coefficients used to encode
|
||||
* or decode data.
|
||||
* @param gftbls Pointer to start of space for concatenated output tables
|
||||
* generated from input coefficients. Must be of size 32*k*rows.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
|
||||
*
|
||||
* Given a list of source data blocks, generate one or multiple blocks of
|
||||
* encoded data as specified by a matrix of GF(2^8) coefficients. When given a
|
||||
* suitable set of coefficients, this function will perform the fast generation
|
||||
* or decoding of Reed-Solomon type erasure codes.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and
|
||||
* selects the appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each block of data (vector) of source or dest data.
|
||||
* @param k The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param rows The number of output vectors to concurrently encode/decode.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*k*rows
|
||||
* @param data Array of pointers to source input buffers.
|
||||
* @param coding Array of pointers to coded output buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data.
|
||||
*
|
||||
* Arch specific version of ec_encode_data() with same parameters.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data.
|
||||
*
|
||||
* Arch specific version of ec_encode_data() with same parameters.
|
||||
* @requires AVX
|
||||
*/
|
||||
void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data.
|
||||
*
|
||||
* Arch specific version of ec_encode_data() with same parameters.
|
||||
* @requires AVX2
|
||||
*/
|
||||
void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data, runs baseline version.
|
||||
*
|
||||
* Baseline version of ec_encode_data() with same parameters.
|
||||
*/
|
||||
void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
|
||||
*
|
||||
* Given one source data block, update one or multiple blocks of encoded data as
|
||||
* specified by a matrix of GF(2^8) coefficients. When given a suitable set of
|
||||
* coefficients, this function will perform the fast generation or decoding of
|
||||
* Reed-Solomon type erasure codes from one input source at a time.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and selects the
|
||||
* appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each block of data (vector) of source or dest data.
|
||||
* @param k The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param rows The number of output vectors to concurrently encode/decode.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param g_tbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*k*rows
|
||||
* @param data Pointer to single input source used to update output parity.
|
||||
* @param coding Array of pointers to coded output buffers.
|
||||
* @returns none
|
||||
*/
|
||||
void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Arch specific version of ec_encode_data_update() with same parameters.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Arch specific version of ec_encode_data_update() with same parameters.
|
||||
* @requires AVX
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Arch specific version of ec_encode_data_update() with same parameters.
|
||||
* @requires AVX2
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Baseline version of ec_encode_data_update().
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
|
||||
unsigned char *data, unsigned char **dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with two outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate two ouputs at a time. Does two
|
||||
* GF(2^8) dot products across each byte of the input array and two constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 2*32*vlen byte constant array based on the two sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with two outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate two ouputs at a time. Does two
|
||||
* GF(2^8) dot products across each byte of the input array and two constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 2*32*vlen byte constant array based on the two sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with two outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate two ouputs at a time. Does two
|
||||
* GF(2^8) dot products across each byte of the input array and two constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 2*32*vlen byte constant array based on the two sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with three outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate three ouputs at a time. Does three
|
||||
* GF(2^8) dot products across each byte of the input array and three constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 3*32*vlen byte constant array based on the three sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with three outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate three ouputs at a time. Does three
|
||||
* GF(2^8) dot products across each byte of the input array and three constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 3*32*vlen byte constant array based on the three sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with three outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate three ouputs at a time. Does three
|
||||
* GF(2^8) dot products across each byte of the input array and three constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 3*32*vlen byte constant array based on the three sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with four outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate four outputs at a time. Does four
|
||||
* GF(2^8) dot products across each byte of the input array and four constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 4*32*vlen byte constant array based on the four sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with four outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate four outputs at a time. Does four
|
||||
* GF(2^8) dot products across each byte of the input array and four constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 4*32*vlen byte constant array based on the four sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with four outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate four outputs at a time. Does four
|
||||
* GF(2^8) dot products across each byte of the input array and four constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 4*32*vlen byte constant array based on the four sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with five outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate five outputs at a time. Does five
|
||||
* GF(2^8) dot products across each byte of the input array and five constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 5*32*vlen byte constant array based on the five sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
 * @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with five outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate five outputs at a time. Does five
|
||||
* GF(2^8) dot products across each byte of the input array and five constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 5*32*vlen byte constant array based on the five sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
 * @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with five outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate five outputs at a time. Does five
|
||||
* GF(2^8) dot products across each byte of the input array and five constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 5*32*vlen byte constant array based on the five sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
 * @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with six outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate six outputs at a time. Does six
|
||||
* GF(2^8) dot products across each byte of the input array and six constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 6*32*vlen byte constant array based on the six sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with six outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate six outputs at a time. Does six
|
||||
* GF(2^8) dot products across each byte of the input array and six constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 6*32*vlen byte constant array based on the six sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with six outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate six outputs at a time. Does six
|
||||
* GF(2^8) dot products across each byte of the input array and six constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 6*32*vlen byte constant array based on the six sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_6vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product, runs baseline version.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients. Only elements 32*CONST*j + 1
|
||||
* of this array are used, where j = (0, 1, 2...) and CONST is the
|
||||
* number of elements in the array of input coefficients. The
|
||||
* elements used correspond to the original input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product, runs appropriate version.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and
|
||||
* selects the appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, runs appropriate version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constant and add to destination array. Can be used for erasure coding encode
|
||||
* and decode update when only one source is available at a time. Function
|
||||
* requires pre-calculation of a 32*vec byte constant array based on the input
|
||||
* coefficients.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and selects the
|
||||
* appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, arch specific version.
|
||||
*
|
||||
* Arch specific version of gf_vect_mad() with same parameters.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
|
||||
void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, arch specific version.
|
||||
*
|
||||
* Arch specific version of gf_vect_mad() with same parameters.
|
||||
* @requires AVX
|
||||
*/
|
||||
|
||||
void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, arch specific version.
|
||||
*
|
||||
* Arch specific version of gf_vect_mad() with same parameters.
|
||||
* @requires AVX2
|
||||
*/
|
||||
|
||||
void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, baseline version.
|
||||
*
|
||||
* Baseline version of gf_vect_mad() with same parameters.
|
||||
*/
|
||||
|
||||
void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 2 accumulate. SSE version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constants and add to destination arrays. Can be used for erasure coding
|
||||
* encode and decode update when only one source is available at a
|
||||
* time. Function requires pre-calculation of a 32*vec byte constant array based
|
||||
* on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Pointer to source input array.
|
||||
* @param dest Array of pointers to destination input/outputs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constants and add to destination arrays. Can be used for erasure coding
|
||||
* encode and decode update when only one source is available at a
|
||||
* time. Function requires pre-calculation of a 32*vec byte constant array based
|
||||
* on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Pointer to source input array.
|
||||
* @param dest Array of pointers to destination input/outputs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constants and add to destination arrays. Can be used for erasure coding
|
||||
* encode and decode update when only one source is available at a
|
||||
* time. Function requires pre-calculation of a 32*vec byte constant array based
|
||||
* on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Pointer to source input array.
|
||||
* @param dest Array of pointers to destination input/outputs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* The remaining are lib support functions used in GF(2^8) operations.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Single element GF(2^8) multiply.
|
||||
*
|
||||
* @param a Multiplicand a
|
||||
* @param b Multiplicand b
|
||||
* @returns Product of a and b in GF(2^8)
|
||||
*/
|
||||
|
||||
unsigned char gf_mul(unsigned char a, unsigned char b);
|
||||
|
||||
/**
|
||||
* @brief Single element GF(2^8) inverse.
|
||||
*
|
||||
* @param a Input element
|
||||
* @returns Field element b such that a x b = {1}
|
||||
*/
|
||||
|
||||
unsigned char gf_inv(unsigned char a);
|
||||
|
||||
/**
|
||||
* @brief Generate a matrix of coefficients to be used for encoding.
|
||||
*
|
||||
* Vandermonde matrix example of encoding coefficients where high portion of
|
||||
* matrix is identity matrix I and lower portion is constructed as 2^{i*(j-k+1)}
|
||||
* i:{0,k-1} j:{k,m-1}. Commonly used method for choosing coefficients in
|
||||
 * erasure encoding but does not guarantee invertibility for every sub matrix. For
|
||||
* large k it is possible to find cases where the decode matrix chosen from
|
||||
 * sources and parity not in erasure are not invertible. Users may want to
|
||||
* adjust for k > 5.
|
||||
*
|
||||
* @param a [mxk] array to hold coefficients
|
||||
* @param m number of rows in matrix corresponding to srcs + parity.
|
||||
* @param k number of columns in matrix corresponding to srcs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_gen_rs_matrix(unsigned char *a, int m, int k);
|
||||
|
||||
/**
|
||||
* @brief Generate a Cauchy matrix of coefficients to be used for encoding.
|
||||
*
|
||||
* Cauchy matrix example of encoding coefficients where high portion of matrix
|
||||
* is identity matrix I and lower portion is constructed as 1/(i + j) | i != j,
|
||||
 * i:{0,k-1} j:{k,m-1}. Any sub-matrix of a Cauchy matrix should be invertible.
|
||||
*
|
||||
* @param a [mxk] array to hold coefficients
|
||||
* @param m number of rows in matrix corresponding to srcs + parity.
|
||||
* @param k number of columns in matrix corresponding to srcs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
|
||||
|
||||
/**
|
||||
* @brief Invert a matrix in GF(2^8)
|
||||
*
|
||||
* @param in input matrix
|
||||
* @param out output matrix such that [in] x [out] = [I] - identity matrix
|
||||
* @param n size of matrix [nxn]
|
||||
* @returns 0 successful, other fail on singular input matrix
|
||||
*/
|
||||
|
||||
int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
|
||||
|
||||
|
||||
/*************************************************************/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ERASURE_CODE_H_
|
148
include/gf_vect_mul.h
Normal file
148
include/gf_vect_mul.h
Normal file
@ -0,0 +1,148 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
#ifndef _GF_VECT_MUL_H
|
||||
#define _GF_VECT_MUL_H
|
||||
|
||||
/**
|
||||
* @file gf_vect_mul.h
|
||||
* @brief Interface to functions for vector (block) multiplication in GF(2^8).
|
||||
*
|
||||
* This file defines the interface to routines used in fast RAID rebuild and
|
||||
* erasure codes.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
|
||||
* and src must be aligned to 32B.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
* @returns 0 pass, other fail
|
||||
*/
|
||||
|
||||
int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
|
||||
* and src must be aligned to 32B.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
* @returns 0 pass, other fail
|
||||
*/
|
||||
|
||||
int gf_vect_mul_avx(int len, unsigned char *gftbl, void *src, void *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant, runs appropriate version.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
|
||||
* Len and src must be aligned to 32B.
|
||||
*
|
||||
* This function determines what instruction sets are enabled
|
||||
* and selects the appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
* @returns 0 pass, other fail
|
||||
*/
|
||||
|
||||
int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Initialize 32-byte constant array for GF(2^8) vector multiply
|
||||
*
|
||||
* Calculates array {C{00}, C{01}, C{02}, ... , C{0f} }, {C{00}, C{10},
|
||||
* C{20}, ... , C{f0} } as required by other fast vector multiply
|
||||
* functions.
|
||||
* @param c Constant input.
|
||||
* @param gftbl Table output.
|
||||
*/
|
||||
|
||||
void gf_vect_mul_init(unsigned char c, unsigned char* gftbl);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant, runs baseline version.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
|
||||
* and src must be aligned to 32B.
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param a Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
 * Only the 2nd element of this array is used.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
*/
|
||||
|
||||
void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_GF_VECT_MUL_H
|
123
include/reg_sizes.asm
Normal file
123
include/reg_sizes.asm
Normal file
@ -0,0 +1,123 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; Include guard for this NASM/yasm helper file.
%ifndef _REG_SIZES_ASM_
%define _REG_SIZES_ASM_

; CPUID / XGETBV feature-flag masks for run-time CPU feature detection.
%define EFLAGS_HAS_CPUID (1<<21)
%define FLAG_CPUID1_ECX_CLMUL (1<<1)
%define FLAG_CPUID1_EDX_SSE2 (1<<26)
%define FLAG_CPUID1_ECX_SSE3 (1)
%define FLAG_CPUID1_ECX_SSE4_1 (1<<19)
%define FLAG_CPUID1_ECX_SSE4_2 (1<<20)
%define FLAG_CPUID1_ECX_POPCNT (1<<23)
%define FLAG_CPUID1_ECX_AESNI (1<<25)
%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
%define FLAG_CPUID1_ECX_AVX (1<<28)
; NOTE(review): AVX2 is reported by CPUID leaf 7 (EBX bit 5); the CPUID1
; prefix here suggests leaf 1 -- confirm the leaf used at the call sites.
%define FLAG_CPUID1_EBX_AVX2 (1<<5)
; XCR0 bits 1 and 2 set: OS saves both XMM and YMM state on context switch.
%define FLAG_XGETBV_EAX_XMM_YMM 0x6

; CPUID leaf 1 EAX signature (family/model/stepping) used to detect Avoton.
%define FLAG_CPUID1_EAX_AVOTON 0x000406d0

; Define d (dword), w (word) and b (byte) aliases for the 64-bit registers,
; consumed by the DWORD()/WORD()/BYTE() token-pasting macros below.

%define raxd eax
%define raxw ax
%define raxb al

%define rbxd ebx
%define rbxw bx
%define rbxb bl

%define rcxd ecx
%define rcxw cx
%define rcxb cl

%define rdxd edx
%define rdxw dx
%define rdxb dl

%define rsid esi
%define rsiw si
%define rsib sil

%define rdid edi
%define rdiw di
%define rdib dil

%define rbpd ebp
%define rbpw bp
%define rbpb bpl

; Map each ymm register name to its low-128-bit xmm alias for XWORD().
%define ymm0x xmm0
%define ymm1x xmm1
%define ymm2x xmm2
%define ymm3x xmm3
%define ymm4x xmm4
%define ymm5x xmm5
%define ymm6x xmm6
%define ymm7x xmm7
%define ymm8x xmm8
%define ymm9x xmm9
%define ymm10x xmm10
%define ymm11x xmm11
%define ymm12x xmm12
%define ymm13x xmm13
%define ymm14x xmm14
%define ymm15x xmm15

; Token-paste helpers: DWORD(rax) -> raxd -> eax, etc.
%define DWORD(reg) reg %+ d
%define WORD(reg) reg %+ w
%define BYTE(reg) reg %+ b

; XWORD(ymm3) -> ymm3x -> xmm3 (low 128 bits of a ymm register).
%define XWORD(reg) reg %+ x

; Mark the stack non-executable on ELF targets (GNU-stack note section).
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
; NOTE(review): remaps the token elf64 to macho64 so elf64-conditional code
; also matches when assembling for Mach-O -- confirm this is the intent.
%ifidn __OUTPUT_FORMAT__, macho64
%define elf64 macho64
%endif

; slversion name, major, minor, fixup:
; emits global symbols <name>_slver / <name>_slver_<maj><min><fix> holding a
; packed version stamp (dw fixup, db minor, db major) in .text.
%macro slversion 4
section .text
global %1_slver_%2%3%4
global %1_slver
%1_slver:
%1_slver_%2%3%4:
	dw 0x%4
	db 0x%3, 0x%2
%endmacro

%endif ; ifndef _REG_SIZES_ASM_
|
81
include/test.h
Normal file
81
include/test.h
Normal file
@ -0,0 +1,81 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
#ifndef _TEST_H
|
||||
#define _TEST_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Use sys/time.h functions for time
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
/* Simple timer handle for the perf_start/perf_stop/perf_print helpers:
 * just a captured gettimeofday() timestamp. */
struct perf{
	struct timeval tv;	/* wall-clock time recorded at start or stop */
};
inline int perf_start(struct perf *p)
|
||||
{
|
||||
return gettimeofday(&(p->tv), 0);
|
||||
}
|
||||
inline int perf_stop(struct perf *p)
|
||||
{
|
||||
return gettimeofday(&(p->tv), 0);
|
||||
}
|
||||
|
||||
/* Print elapsed time between start and stop in microseconds and, when a
 * non-zero data size is given, the implied bandwidth in MB/s.
 *
 * NOTE(review): uses printf but only <sys/time.h> is visibly included in
 * this header -- confirm <stdio.h> is pulled in by users of this file.
 * NOTE(review): plain `inline` (not `static inline`) in a header can cause
 * missing-definition link errors under strict C99 inline rules -- verify
 * the compilers/modes this is built with. */
inline void perf_print(struct perf stop, struct perf start, long long dsize)
{
	/* Elapsed time folded down to microseconds. */
	long long secs = stop.tv.tv_sec - start.tv.tv_sec;
	long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec;

	printf("runtime = %10lld usecs", usecs);
	if (dsize != 0) {
#if 1 // single-printf form; the split form below was a 32-bit printf workaround
		printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024),
			((double) usecs)/1000000, ((double) dsize) / (double)usecs);
#else
		printf(", bandwidth %lld MB ", dsize/(1024*1024));
		printf("in %.4f sec ",(double)usecs/1000000);
		printf("= %.2f MB/s\n", (double)dsize/usecs);
#endif
	}
	else
		printf("\n");
}
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _TEST_H
|
88
include/types.h
Normal file
88
include/types.h
Normal file
@ -0,0 +1,88 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* @file types.h
|
||||
* @brief Defines standard width types.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __TYPES_H
|
||||
#define __TYPES_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Fixed-width unsigned/signed integer aliases.
 * NOTE(review): in the non-Windows branch UINT64/INT64 are `long int`,
 * which is 32 bits on 32-bit non-Windows targets -- confirm 32-bit builds
 * do not rely on these being 64 bits wide. */
#ifdef __WIN32__
#ifdef __MINGW32__
# include <_mingw.h>
#endif
typedef unsigned __int64 UINT64;
typedef __int64 INT64;
typedef unsigned __int32 UINT32;
typedef unsigned __int16 UINT16;
typedef unsigned char UINT8;
#else
typedef unsigned long int UINT64;
typedef long int INT64;
typedef unsigned int UINT32;
typedef unsigned short int UINT16;
typedef unsigned char UINT8;
#endif


/* Portability shims: aligned-declaration attribute, forced-inline keyword,
 * and aligned allocation/free. On Windows (non-MinGW), posix_memalign is
 * emulated on top of _aligned_malloc. */
#if defined __unix__ || defined __APPLE__
# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
# define __forceinline static inline
# define aligned_free(x) free(x)
#else
# ifdef __MINGW32__
# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
# define aligned_free(x) _aligned_free(x)
# else
# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
# define aligned_free(x) _aligned_free(x)
# endif
#endif

/* Debug logging: DEBUG_PRINT((fmt, ...)) -- double parens; compiles to a
 * no-op statement unless DEBUG is defined. */
#ifdef DEBUG
# define DEBUG_PRINT(x) printf x
#else
# define DEBUG_PRINT(x) do {} while (0)
#endif
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //__TYPES_H
|
56
isa-l.def
Normal file
56
isa-l.def
Normal file
@ -0,0 +1,56 @@
|
||||
LIBRARY isa-l
|
||||
EXPORTS
|
||||
|
||||
ec_encode_data_sse @1
|
||||
ec_init_tables @2
|
||||
gf_gen_cauchy1_matrix @3
|
||||
gf_gen_rs_matrix @4
|
||||
gf_invert_matrix @5
|
||||
gf_mul @6
|
||||
gf_vect_dot_prod_base @7
|
||||
gf_vect_mul_base @8
|
||||
ec_encode_data_base @9
|
||||
gf_vect_mul_init @10
|
||||
gf_vect_mul_sse @11
|
||||
gf_vect_mul_avx @12
|
||||
gf_vect_dot_prod_sse @13
|
||||
gf_vect_dot_prod_avx @14
|
||||
gf_vect_dot_prod_avx2 @15
|
||||
gf_2vect_dot_prod_sse @16
|
||||
gf_3vect_dot_prod_sse @17
|
||||
gf_4vect_dot_prod_sse @18
|
||||
gf_5vect_dot_prod_sse @19
|
||||
gf_6vect_dot_prod_sse @20
|
||||
gf_2vect_dot_prod_avx @21
|
||||
gf_3vect_dot_prod_avx @22
|
||||
gf_4vect_dot_prod_avx @23
|
||||
gf_5vect_dot_prod_avx @24
|
||||
gf_6vect_dot_prod_avx @25
|
||||
gf_2vect_dot_prod_avx2 @26
|
||||
gf_3vect_dot_prod_avx2 @27
|
||||
gf_4vect_dot_prod_avx2 @28
|
||||
gf_5vect_dot_prod_avx2 @29
|
||||
gf_6vect_dot_prod_avx2 @30
|
||||
gf_vect_mad_sse @31
|
||||
gf_2vect_mad_sse @32
|
||||
gf_3vect_mad_sse @33
|
||||
gf_4vect_mad_sse @34
|
||||
gf_5vect_mad_sse @35
|
||||
gf_6vect_mad_sse @36
|
||||
gf_vect_mad_avx @37
|
||||
gf_2vect_mad_avx @38
|
||||
gf_3vect_mad_avx @39
|
||||
gf_4vect_mad_avx @40
|
||||
gf_5vect_mad_avx @41
|
||||
gf_6vect_mad_avx @42
|
||||
gf_vect_mad_avx2 @43
|
||||
gf_2vect_mad_avx2 @44
|
||||
gf_3vect_mad_avx2 @45
|
||||
gf_4vect_mad_avx2 @46
|
||||
gf_5vect_mad_avx2 @47
|
||||
gf_6vect_mad_avx2 @48
|
||||
ec_encode_data @49
|
||||
gf_vect_mul @50
|
||||
ec_encode_data_update @51
|
||||
gf_vect_dot_prod @52
|
||||
gf_vect_mad @53
|
11
libisal.pc.in
Normal file
11
libisal.pc.in
Normal file
@ -0,0 +1,11 @@
|
||||
prefix=@prefix@
|
||||
exec_prefix=@exec_prefix@
|
||||
libdir=@libdir@
|
||||
includedir=@includedir@
|
||||
|
||||
Name: libisal
|
||||
Description: Library for storage systems
|
||||
Version: @VERSION@
|
||||
Libs: -L${libdir} -lisal
|
||||
Libs.private:
|
||||
Cflags: -I${includedir}
|
246
make.inc
Normal file
246
make.inc
Normal file
@ -0,0 +1,246 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
|
||||
# Makefile include for optimized libraries
|
||||
# make targets:
|
||||
# lib - build library of optimized functions
|
||||
# slib - build shared library
|
||||
# test - run unit tests of functions
|
||||
# perf - run performance tests
|
||||
# install - install headers and libs to system location
|
||||
# sim - run on simulator
|
||||
# trace - get simulator trace
|
||||
# clean - remove object files
|
||||
|
||||
version ?= #auto filled on release

# Toolchain defaults; overridable from the command line (make CC=... AS=...).
CC = gcc
AS = yasm
SIM = sde $(SIMFLAGS) --

# Per-assembler debug flags, selected later via $(DEBUG_$(AS)).
DEBUG = -g
DEBUG_yasm = -g dwarf2
DEBUG_nasm = -g

# Default arch= build options
# Flags are looked up by suffix: $(CFLAGS_$(arch)), $(ASFLAGS_$(CC)), etc.
CFLAGS_gcc = -Wall
ASFLAGS_ = -f elf64
ARFLAGS_ = cr $@
STRIP_gcc = strip -d -R .comment $@

# arch=32 build options
ASFLAGS_32 = -f elf32
CFLAGS_32 = -m32
ARFLAGS_32 = cr $@

# arch=win64 build options
ASFLAGS_win64 = -f win64
CFLAGS_icl = -Qstd=c99
ARFLAGS_win64 = -out:$@

# arch=mingw build options
ASFLAGS_mingw = -f win64
ARFLAGS_mingw = cr $@
lsrcmingw = $(lsrc)
unit_testsmingw = $(unit_tests)
examplesmingw = $(examples)
perf_testsmingw = $(perf_tests)

ifeq ($(arch),mingw)
CC=x86_64-w64-mingw32-gcc
AR=x86_64-w64-mingw32-ar
LDFLAGS = -Wl,--force-exe-suffix
endif


# Recursive (=) assignment is deliberate below: DEBUG and DEFINES are changed
# later by target-specific rules (e.g. "$(lib_name): DEBUG="), so CFLAGS and
# ASFLAGS must be re-expanded at use time rather than frozen with :=.
INCLUDE = $(patsubst %,-I%,$(subst :, ,$(VPATH)))
CFLAGS = $(CFLAGS_$(arch)) $(CFLAGS_$(CC)) $(DEBUG) -O2 $(DEFINES) $(INCLUDE)
ASFLAGS = $(ASFLAGS_$(arch)) $(ASFLAGS_$(CC)) $(DEBUG_$(AS)) $(DEFINES) $(INCLUDE)
ARFLAGS = $(ARFLAGS_$(arch))
DEFINES += $(addprefix -D , $D)
|
||||
# Output directory and object lists derived from the including Makefile's
# $(lsrc...) source lists.
O = bin
lobj += $(patsubst %.c,%.o,$(patsubst %.asm,%.o,$(lsrc$(arch)) $(lsrc_intrinsic)))
objs = $(addprefix $(O)/,$(notdir $(lobj)))


lib_name ?= isa-l.a
default: lib slib

# Defaults for windows build
ifeq ($(arch),win64)
AR=lib
CC=cl
OUTPUT_OPTION = -Fo$@
DEBUG=
lib_name := $(basename $(lib_name)).lib
endif
lsrcwin64 = $(lsrc)
unit_testswin64 = $(unit_tests)
exampleswin64 = $(examples)
perf_testswin64 = $(perf_tests)

# Build and run unit tests, performance tests, etc.
all_tests = $(notdir $(sort $(perf_tests$(arch)) $(check_tests$(arch)) $(unit_tests$(arch)) $(examples$(arch)) $(other_tests)))
all_unit_tests = $(notdir $(sort $(check_tests$(arch)) $(unit_tests$(arch))))

# Static pattern rules: each test binary depends on its .c and the lib.
$(all_unit_tests): % : %.c $(lib_name)
$(sort $(notdir $(perf_tests$(arch)))): % : %.c $(lib_name)
$(sort $(examples$(arch))): % : %.c $(lib_name)
$(sort $(other_tests)): % : %.c $(lib_name)

sim test trace: $(addsuffix .run,$(all_unit_tests))

perf: $(addsuffix .run,$(notdir $(perf_tests$(arch))))
ex: $(examples$(arch))
all: lib $(all_tests)
other: $(other_tests)
tests: $(all_unit_tests)
perfs: $(notdir $(perf_tests$(arch)))
# check/test/perf run natively (empty SIM); trace runs under the simulator.
check test perf: SIM=
trace: SIMFLAGS = -debugtrace
check test sim:
	@echo Finished running $@

#$(foreach c, $^, ./$c )
#for i in $^; do ./$$i ; done
||||
# Order-only prerequisite: objects need $(O) to exist, but its timestamp
# must not trigger rebuilds.
$(objs): | $(O)
$(O): ; mkdir -p $(O)


# Build rule to run tests
%.run: %
	$(SIM) $(@D)/$<
	@echo Completed run: $<

# Other build rules
msg = $(if $(DEBUG),DEBUG) $(patsubst 32,32-bit,$(arch)) $D

$(O)/%.o: %.asm
	@echo " ---> Building $< $(msg)"
	@$(AS) $(ASFLAGS) -o $@ $<

$(O)/%.o %.o: %.c
	@echo " ---> Building $< $(msg)"
	@$(COMPILE.c) $(OUTPUT_OPTION) $<

$(all_tests):
	@echo " ---> Building Test $@ $(msg)"
	@$(LINK.o) $(CFLAGS) $^ $(LDLIBS) -o $@


# Target to build lib files
lib: $(lib_name)
# Release lib build: strip debug info and define NDEBUG via target-specific
# variable overrides (this is why DEBUG/DEFINES use recursive assignment).
ifneq ($(lib_debug),1)
$(lib_name): DEBUG_$(AS)= # Don't put debug symbols in the lib
$(lib_name): DEBUG=
$(lib_name): DEFINES+=-D NDEBUG
endif
ifeq ($(lib_debug),1)
DEBUG+=-D DEBUG # Define DEBUG for macros
endif

#lib $(lib_name): $(lib_name)(${objs})
$(lib_name): $(objs)
	@echo " ---> Creating Lib $@"
	@$(AR) $(ARFLAGS) $^
	@$(STRIP_$(CC))
||||
# Target for shared lib
so_lib_name = bin/libisal.so
so_lib_inst = $(notdir $(so_lib_name))
so_lib_ver = $(so_lib_inst).$(version)
# soname carries only the major version (first dot-separated field).
soname = $(so_lib_inst).$(word 1, $(subst ., ,$(version)))

slib: $(so_lib_name)
# asm objects are shared with the static lib; C objects get a separate
# shared_ver_ prefix so they can be rebuilt with -fPIC.
aobjs += $(addprefix $(O)/,$(patsubst %.asm,%.o,$(filter %.asm,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))
shared_objs += $(addprefix $(O)/shared_ver_,$(patsubst %.c,%.o,$(filter %.c,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))

$(O)/shared_ver_%.o: %.c
	@echo " ---> Building shared $< $(msg)"
	@$(COMPILE.c) $(OUTPUT_OPTION) $<

# Release shared-lib build: same debug-stripping overrides as the static lib.
ifneq ($(lib_debug),1)
$(so_lib_name): DEBUG_$(AS)=
$(so_lib_name): DEBUG=
$(so_lib_name): DEFINES+=-D NDEBUG
endif

$(shared_objs): CFLAGS += -fPIC
$(shared_objs) $(aobjs): | $(O)
$(so_lib_name): LDFLAGS+=-Wl,-soname,$(soname)
$(so_lib_name): $(shared_objs) $(aobjs)
	@echo " ---> Creating Shared Lib $@"
	@$(CC) $(CFLAGS) --shared $(LDFLAGS) -o $@ $^
	@(cd $(@D); ln -f -s $(so_lib_inst) $(soname))
||||
# Target for install
prefix = /usr/local
install_dirs = $(prefix)/lib $(prefix)/include/isa-l
$(install_dirs): ; mkdir -p $@
# NOTE(review): headers and the shared lib are installed mode 664
# (group-writable) while the static lib is 644 -- confirm 664 is intended.
install: $(sort $(extern_hdrs)) | $(install_dirs) $(lib_name) $(so_lib_name) isa-l.h
	install -m 644 $(lib_name) $(prefix)/lib/libisal.a
	install -m 644 $^ $(prefix)/include/isa-l/.
	install -m 664 include/isa-l.h $(prefix)/include/.
	install -m 664 $(so_lib_name) $(prefix)/lib/$(so_lib_ver)
	(cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(soname) && ln -f -s $(so_lib_ver) $(so_lib_inst))
ifeq ($(shell uname),Darwin)
	(cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(basename $(so_lib_inst)).dylib)
endif
	which libtool && libtool --mode=finish $(prefix)/lib || \
	echo 'Lib installed at $(prefix)/lib. Run system-dependent programs to add shared lib path.'

uninstall:
	$(RM) $(prefix)/lib/libisal.a
	$(RM) $(prefix)/lib/$(soname)
	$(RM) $(prefix)/lib/$(so_lib_ver)
	$(RM) $(prefix)/lib/$(so_lib_inst)
	$(RM) -r $(prefix)/include/isa-l
	$(RM) $(prefix)/include/isa-l.h
	$(RM) $(prefix)/lib/$(basename $(so_lib_inst)).dylib

# Collect performance data
rpt_name = perf_report_$(shell uname -n)_$(shell date +%y%m%d).perf

# Appends a run of the perf targets to a per-host, per-day report file;
# uses $(MAKE) so -j/-n propagate to the sub-make.
perf_report:
	echo Results for $(rpt_name) >> $(rpt_name)
	$(MAKE) -k perf | tee -a $(rpt_name)
	@echo Summary:
	-grep runtime $(rpt_name)


# NOTE(review): none of the command targets here (all, clean, test, install,
# perf, ...) are declared .PHONY, so a file with one of those names would
# shadow the target -- confirm whether upstream relies on that never happening.
clean:
	@echo Cleaning up
	@$(RM) -r $(O) *.o *.a $(all_tests) $(lib_name) $(so_lib_name)
31
tools/yasm-filter.sh
Executable file
31
tools/yasm-filter.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/bin/sh
# (fix: original shebang was "#/bin/sh" -- missing '!', so the kernel never
# recognized it as an interpreter line.)

# Filter out unnecessary options added by automake before invoking yasm.
# Options yasm understands are accumulated in $options; unknown options are
# dropped; non-option words (input files) are kept in $args.

# Initialize explicitly so values inherited from the environment cannot
# leak into the yasm command line.
options=
args=

# Use $# rather than [ -n "$*" ]: the latter exits early (dropping the rest
# of the arguments) when the remaining arguments are empty strings.
while [ $# -gt 0 ]; do
	case "$1" in
	-f | -o | -I | -i | -D )
		# Supported options that take a separate argument
		options="$options $1 $2"
		shift
		shift
		;;
	-I* | -i* | --prefix* )
		# Supported options with the argument attached
		options="$options $1"
		shift
		;;
	#-blah )
	# Unsupported options with args - none known
	-* )
		# Unsupported option with no argument: drop it
		shift
		;;
	* )
		# Not an option: keep as a positional argument (source file, etc.)
		args="$args $1"
		shift
		;;
	esac
done

yasm $options $args
Loading…
Reference in New Issue
Block a user