Initial commit isa-l v2.14.1

Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
Greg Tucker 2015-10-22 14:54:34 -07:00
commit 00c1efc109
96 changed files with 35658 additions and 0 deletions

About_bsd.txt Normal file

@@ -0,0 +1,26 @@
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Makefile.am Normal file

@@ -0,0 +1,100 @@
EXTRA_DIST = autogen.sh Makefile.unx make.inc Makefile.nmake isa-l.def About_bsd.txt
CLEANFILES =
LDADD =
AM_MAKEFLAGS = --no-print-directory
noinst_HEADERS =
pkginclude_HEADERS = include/test.h
noinst_LTLIBRARIES =
INCLUDE = -I $(srcdir)/include
AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} ${D}
lsrc=
extern_hdrs=
other_src=
check_tests=
unit_tests=
perf_tests=
unit_tests_extra=
perf_tests_extra=
examples=
other_tests=
lsrc32=
unit_tests32=
perf_tests32=
# Include units
include erasure_code/Makefile.am
# LIB version info not necessarily the same as package version
LIBISAL_CURRENT=2
LIBISAL_REVISION=14
LIBISAL_AGE=0
lib_LTLIBRARIES = libisal.la
pkginclude_HEADERS += $(sort ${extern_hdrs})
libisal_la_SOURCES = ${lsrc}
nobase_include_HEADERS = isa-l.h
libisal_la_LDFLAGS = $(AM_LDFLAGS) \
-version-info $(LIBISAL_CURRENT):$(LIBISAL_REVISION):$(LIBISAL_AGE)
libisal_la_LIBADD = ${noinst_LTLIBRARIES}
EXTRA_DIST += ${other_src}
EXTRA_DIST += Release_notes.txt
# For tests
LDADD += libisal.la
check_PROGRAMS = ${check_tests}
TESTS = ${check_tests}
# For additional tests
EXTRA_PROGRAMS = ${unit_tests}
EXTRA_PROGRAMS += ${perf_tests}
EXTRA_PROGRAMS += ${other_tests}
EXTRA_PROGRAMS += ${examples}
CLEANFILES += ${EXTRA_PROGRAMS}
perfs: ${perf_tests}
tests: ${unit_tests}
other: ${other_tests}
perf: $(addsuffix .run,$(perf_tests))
ex: ${examples}
test: $(addsuffix .run,$(unit_tests))
# Build rule to run tests
%.run: %
$<
@echo Completed run: $<
# Support for yasm
CCAS = ${srcdir}/tools/yasm-filter.sh
EXTRA_DIST += tools/yasm-filter.sh
AM_CCASFLAGS = ${yasm_args} ${INCLUDE}
.asm.s:
@echo " MKTMP " $@;
@cp $< $@
# Generate isa-l.h
BUILT_SOURCES = isa-l.h
CLEANFILES += isa-l.h
isa-l.h:
@echo 'Building $@'
@echo '' >> $@
@echo '#ifndef _ISAL_H_' >> $@
@echo '#define _ISAL_H_' >> $@
@echo '' >> $@
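# Note: the dots below are awk field separators -- with VERSION=x.y.z the
# echoed '#define.ISAL_MAJOR_VERSION.x.y.z' is split on '.' and printed back
# as '#define ISAL_MAJOR_VERSION x' (and likewise for the minor and patch).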
@echo '#define.ISAL_MAJOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
@echo '#define.ISAL_MINOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
@echo '#define.ISAL_PATCH_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
@echo '#define ISAL_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
@echo '#define ISAL_VERSION ISAL_MAKE_VERSION(ISAL_MAJOR_VERSION, ISAL_MINOR_VERSION, ISAL_PATCH_VERSION)' >> $@
@echo '' >> $@
@for unit in ${extern_hdrs}; do echo "#include <isa-l/$$unit>" | sed -e 's;include/;;' >> $@; done
@echo '#endif //_ISAL_H_' >> $@
license = bsd
licc = $(srcdir)/doc/license_$(license)_c.txt
lica = $(srcdir)/doc/license_$(license)_asm.txt
licm = $(srcdir)/doc/license_$(license)_make.txt

Makefile.nmake Normal file

@@ -0,0 +1,88 @@
########################################################################
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj
INCLUDES = -I. -Ierasure_code -Iinclude
LINKFLAGS = /nologo
CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D)
AFLAGS = -f win64 $(INCLUDES) $(D)
CC = icl
AS = yasm
lib: bin static dll
static: bin isa-l_static.lib
dll: bin isa-l.dll
bin: ; -mkdir $@
isa-l_static.lib: $(objs)
lib -out:$@ $?
isa-l.dll: $(objs)
link -out:$@ -dll -def:isa-l.def $?
{erasure_code}.c.obj:
$(CC) $(CFLAGS) /c -Fo$@ $?
{erasure_code}.asm.obj:
$(AS) $(AFLAGS) -o $@ $?
.obj.exe:
link /out:$@ $(LINKFLAGS) isa-l.lib $?
# Check tests
checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe
checks: lib $(checks)
$(checks): $(@B).obj
check: $(checks)
!$?
# Unit tests
tests = erasure_code_base_test.exe erasure_code_sse_test.exe gf_2vect_dot_prod_sse_test.exe gf_3vect_dot_prod_sse_test.exe gf_4vect_dot_prod_sse_test.exe gf_5vect_dot_prod_sse_test.exe gf_6vect_dot_prod_sse_test.exe gf_vect_dot_prod_avx_test.exe gf_vect_dot_prod_base_test.exe gf_vect_dot_prod_sse_test.exe gf_vect_dot_prod_test.exe gf_vect_mad_test.exe gf_vect_mul_avx_test.exe gf_vect_mul_base_test.exe gf_vect_mul_sse_test.exe
tests: lib $(tests)
$(tests): $(@B).obj
# Performance tests
perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe
perfs: lib $(perfs)
$(perfs): $(@B).obj
clean:
-if exist *.obj del *.obj
-if exist bin\*.obj del bin\*.obj
-if exist isa-l_static.lib del isa-l_static.lib
-if exist *.exe del *.exe
-if exist isa-l.lib del isa-l.lib
-if exist isa-l.dll del isa-l.dll

Makefile.unx Normal file

@@ -0,0 +1,41 @@
########################################################################
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
units = erasure_code
default: lib
include $(foreach unit,$(units), $(unit)/Makefile.am)
# Override individual lib names to make one inclusive library.
lib_name := bin/isa-l.a
include make.inc
VPATH = . $(units) include

README Normal file

@@ -0,0 +1,19 @@
=================================================
Intel(R) Intelligent Storage Acceleration Library
=================================================
Build Prerequisites
===================
ISA-L requires yasm version 1.2 or later.
Building ISA-L
==============
To build and install the library, it is usually sufficient to run the following:
./configure
make
sudo make install
Other targets include: make check, make tests, and make perfs.

Release_notes.txt Normal file

@@ -0,0 +1,74 @@
=============================================================================
v2.14 Intel Intelligent Storage Acceleration Library Release Notes
Open Source Version
=============================================================================
=============================================================================
RELEASE NOTE CONTENTS
=============================================================================
1. KNOWN ISSUES
2. FIXED ISSUES
3. CHANGE LOG & FEATURES ADDED
=============================================================================
1. KNOWN ISSUES
=============================================================================
* Only the erasure code unit is included in the open source version at this time.
* Perf tests do not run in the Windows environment.
* The 32-bit lib is not supported on Windows.
=============================================================================
2. FIXED ISSUES
=============================================================================
v2.14
* Building in unit directories is no longer supported, removing the issue of
leftover object files causing the top-level make build to fail.
v2.10
* Fix for Windows register save overlap in gf_{3-6}vect_dot_prod_sse.asm. Only
affects Windows versions of erasure code. GP register saves/restores were
pushed to the same stack area as XMM registers.
=============================================================================
3. CHANGE LOG & FEATURES ADDED
=============================================================================
v2.14
* Autoconf and autotools build allows easier porting to additional systems.
The previous make system is still available to embedded users with Makefile.unx.
* Includes updates for building on Mac OS X/darwin systems. Add --target=darwin
to the ./configure step.
v2.13
* Erasure code improvements
- 32-bit port of optimized gf_vect_dot_prod() functions. This makes
ec_encode_data() functions much faster on 32-bit processors.
- Avoton performance improvements. Performance on Avoton for
gf_vect_dot_prod() and ec_encode_data() can improve by as much as 20%.
v2.11
* Incremental erasure code. New functions added to erasure code to handle
single-source updates of code blocks. The function ec_encode_data_update()
works with parameters similar to ec_encode_data() but is called incrementally
with each source block. These versions are useful when source blocks are not
all available at once (a usage sketch follows this entry).
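The following is a hedged sketch of how the incremental API above can be
driven; it is not part of this commit. It assumes the prototypes declared in
include/erasure_code.h (ec_init_tables, ec_encode_data_update); the helper
name and its parameters are illustrative only. The parity buffers must start
zeroed, because ec_encode_data_update() multiplies and accumulates into them.

#include <stdlib.h>
#include <string.h>
#include "erasure_code.h"

int encode_one_block_at_a_time(int len, int k, int p,
                               unsigned char *encode_matrix, /* (k+p) x k */
                               unsigned char **data,         /* k sources */
                               unsigned char **parity)       /* p outputs */
{
        unsigned char *g_tbls = malloc((size_t)k * p * 32);
        int i;

        if (g_tbls == NULL)
                return -1;

        /* Expand the coding rows of the matrix into multiplication tables. */
        ec_init_tables(k, p, &encode_matrix[k * k], g_tbls);

        /* ec_encode_data_update() XORs into the parity, so clear it first. */
        for (i = 0; i < p; i++)
                memset(parity[i], 0, len);

        /* Fold in each source block as it becomes available. */
        for (i = 0; i < k; i++)
                ec_encode_data_update(len, k, p, i, g_tbls, data[i], parity);

        free(g_tbls);
        return 0;
}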
v2.10
* Erasure code updates
- New AVX and AVX2 support functions.
- Changes min len requirement on gf_vect_dot_prod() to 32 from 16.
- Tests include both source and parity recovery with ec_encode_data().
- New encoding examples with Vandermonde or Cauchy matrix.
v2.8
* First open release of erasure code unit that is part of ISA-L.

autogen.sh Executable file

@@ -0,0 +1,17 @@
#!/bin/sh -e
autoreconf --install --symlink -f
libdir() {
echo $(cd $1/$(gcc -print-multi-os-directory); pwd)
}
args="--prefix=/usr --libdir=$(libdir /usr/lib)"
echo
echo "----------------------------------------------------------------"
echo "Initialized build system. For a common configuration please run:"
echo "----------------------------------------------------------------"
echo
echo "./configure $args"
echo

configure.ac Normal file

@@ -0,0 +1,112 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.69)
AC_INIT([libisal],
[2.14.0],
[sg.support.isal@intel.com],
[isa-l],
[http://01.org/storage-acceleration-library])
AC_CONFIG_SRCDIR([])
AC_CONFIG_AUX_DIR([build-aux])
AM_INIT_AUTOMAKE([
foreign
1.11
-Wall
-Wno-portability
silent-rules
tar-pax
no-dist-gzip
dist-xz
subdir-objects
])
AM_PROG_AS
# Check for programs
AC_PROG_CC_STDC
AC_USE_SYSTEM_EXTENSIONS
AM_SILENT_RULES([yes])
LT_INIT
AC_PREFIX_DEFAULT([/usr])
AC_PROG_SED
AC_PROG_MKDIR_P
AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
if test "$HAVE_YASM" = "no"; then
AC_MSG_ERROR([yasm not found as required.])
fi
AC_MSG_CHECKING([checking for modern yasm])
AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
if yasm -f elf64 -p gas conftest.c ; then
AC_MSG_RESULT([yes])
else
AC_MSG_FAILURE([need modern yasm])
fi
# Options
AC_ARG_ENABLE([debug],
AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]),
[], [enable_debug=no])
AS_IF([test "x$enable_debug" = "xyes"], [
AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.])
])
case $target in
*linux*) arch=linux yasm_args="-f elf64";;
*darwin*) arch=darwin yasm_args="-f macho64 --prefix=_ ";;
*netbsd*) arch=netbsd yasm_args="-f elf64";;
*) arch=unknown yasm_args="-f elf64";;
esac
AC_SUBST([yasm_args])
AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin")
AC_MSG_RESULT([Using yasm args target "$arch" "$yasm_args"])
# Check for header files
#AC_CHECK_HEADERS([limits.h stddef.h stdint.h stdlib.h string.h sys/time.h unistd.h])
AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h])
# Checks for typedefs, structures, and compiler characteristics.
AC_C_INLINE
AC_TYPE_SIZE_T
AC_TYPE_UINT16_T
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
AC_TYPE_UINT8_T
# Checks for library functions.
AC_FUNC_MALLOC # Used only in tests
AC_CHECK_FUNCS([memmove memset])
my_CFLAGS="\
-Wall \
-Wchar-subscripts \
-Wformat-security \
-Wnested-externs \
-Wpointer-arith \
-Wshadow \
-Wstrict-prototypes \
-Wtype-limits \
"
AC_SUBST([my_CFLAGS])
AC_CONFIG_FILES([\
Makefile\
libisal.pc
])
AC_OUTPUT
AC_MSG_RESULT([
$PACKAGE $VERSION
=====
prefix: ${prefix}
sysconfdir: ${sysconfdir}
libdir: ${libdir}
includedir: ${includedir}
compiler: ${CC}
cflags: ${CFLAGS}
ldflags: ${LDFLAGS}
debug: ${enable_debug}
])

erasure_code/Makefile.am Normal file

@@ -0,0 +1,159 @@
########################################################################
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
lsrc += erasure_code/ec_highlevel_func.c \
erasure_code/ec_base.c \
erasure_code/gf_vect_mul_sse.asm \
erasure_code/gf_vect_mul_avx.asm \
erasure_code/gf_vect_dot_prod_sse.asm \
erasure_code/gf_vect_dot_prod_avx.asm \
erasure_code/gf_vect_dot_prod_avx2.asm \
erasure_code/gf_2vect_dot_prod_sse.asm \
erasure_code/gf_3vect_dot_prod_sse.asm \
erasure_code/gf_4vect_dot_prod_sse.asm \
erasure_code/gf_5vect_dot_prod_sse.asm \
erasure_code/gf_6vect_dot_prod_sse.asm \
erasure_code/gf_2vect_dot_prod_avx.asm \
erasure_code/gf_3vect_dot_prod_avx.asm \
erasure_code/gf_4vect_dot_prod_avx.asm \
erasure_code/gf_5vect_dot_prod_avx.asm \
erasure_code/gf_6vect_dot_prod_avx.asm \
erasure_code/gf_2vect_dot_prod_avx2.asm \
erasure_code/gf_3vect_dot_prod_avx2.asm \
erasure_code/gf_4vect_dot_prod_avx2.asm \
erasure_code/gf_5vect_dot_prod_avx2.asm \
erasure_code/gf_6vect_dot_prod_avx2.asm \
erasure_code/gf_vect_mad_sse.asm \
erasure_code/gf_2vect_mad_sse.asm \
erasure_code/gf_3vect_mad_sse.asm \
erasure_code/gf_4vect_mad_sse.asm \
erasure_code/gf_5vect_mad_sse.asm \
erasure_code/gf_6vect_mad_sse.asm \
erasure_code/gf_vect_mad_avx.asm \
erasure_code/gf_2vect_mad_avx.asm \
erasure_code/gf_3vect_mad_avx.asm \
erasure_code/gf_4vect_mad_avx.asm \
erasure_code/gf_5vect_mad_avx.asm \
erasure_code/gf_6vect_mad_avx.asm \
erasure_code/gf_vect_mad_avx2.asm \
erasure_code/gf_2vect_mad_avx2.asm \
erasure_code/gf_3vect_mad_avx2.asm \
erasure_code/gf_4vect_mad_avx2.asm \
erasure_code/gf_5vect_mad_avx2.asm \
erasure_code/gf_6vect_mad_avx2.asm \
erasure_code/ec_multibinary.asm
lsrc32 += erasure_code/ec_highlevel_func.c \
erasure_code/ec_multibinary.asm \
erasure_code/ec_base.c \
erasure_code/gf_vect_dot_prod_avx.asm \
erasure_code/gf_2vect_dot_prod_avx.asm \
erasure_code/gf_3vect_dot_prod_avx.asm \
erasure_code/gf_4vect_dot_prod_avx.asm \
erasure_code/gf_vect_dot_prod_sse.asm \
erasure_code/gf_2vect_dot_prod_sse.asm \
erasure_code/gf_3vect_dot_prod_sse.asm \
erasure_code/gf_4vect_dot_prod_sse.asm \
erasure_code/gf_vect_dot_prod_avx2.asm \
erasure_code/gf_2vect_dot_prod_avx2.asm \
erasure_code/gf_3vect_dot_prod_avx2.asm \
erasure_code/gf_4vect_dot_prod_avx2.asm
unit_tests32 += erasure_code/erasure_code_base_test \
erasure_code/erasure_code_test \
erasure_code/erasure_code_sse_test \
erasure_code/gf_vect_mul_test \
erasure_code/gf_vect_mul_base_test \
erasure_code/gf_vect_dot_prod_base_test \
erasure_code/gf_vect_dot_prod_test \
erasure_code/gf_vect_dot_prod_avx_test \
erasure_code/gf_vect_dot_prod_sse_test \
erasure_code/gf_2vect_dot_prod_sse_test \
erasure_code/gf_3vect_dot_prod_sse_test \
erasure_code/gf_4vect_dot_prod_sse_test
perf_tests32 += erasure_code/gf_vect_mul_perf \
erasure_code/gf_vect_dot_prod_perf \
erasure_code/erasure_code_perf \
erasure_code/erasure_code_base_perf \
erasure_code/erasure_code_sse_perf \
erasure_code/gf_vect_dot_prod_1tbl \
erasure_code/gf_vect_dot_prod_avx_perf\
erasure_code/gf_vect_dot_prod_sse_perf\
erasure_code/gf_2vect_dot_prod_sse_perf \
erasure_code/gf_3vect_dot_prod_sse_perf \
erasure_code/gf_4vect_dot_prod_sse_perf
extern_hdrs += include/erasure_code.h \
include/gf_vect_mul.h
other_src += erasure_code/ec_base.h \
include/reg_sizes.asm
check_tests += erasure_code/gf_vect_mul_test \
erasure_code/erasure_code_test \
erasure_code/gf_inverse_test \
erasure_code/erasure_code_update_test
unit_tests += erasure_code/gf_vect_mul_sse_test \
erasure_code/gf_vect_mul_avx_test \
erasure_code/gf_vect_mul_base_test \
erasure_code/gf_vect_dot_prod_sse_test \
erasure_code/gf_vect_dot_prod_avx_test \
erasure_code/gf_2vect_dot_prod_sse_test \
erasure_code/gf_3vect_dot_prod_sse_test \
erasure_code/gf_4vect_dot_prod_sse_test \
erasure_code/gf_5vect_dot_prod_sse_test \
erasure_code/gf_6vect_dot_prod_sse_test \
erasure_code/gf_vect_dot_prod_base_test \
erasure_code/gf_vect_dot_prod_test \
erasure_code/gf_vect_mad_test \
erasure_code/erasure_code_base_test \
erasure_code/erasure_code_sse_test
perf_tests += erasure_code/gf_vect_mul_perf \
erasure_code/gf_vect_mul_sse_perf \
erasure_code/gf_vect_mul_avx_perf \
erasure_code/gf_vect_dot_prod_sse_perf \
erasure_code/gf_vect_dot_prod_avx_perf \
erasure_code/gf_2vect_dot_prod_sse_perf \
erasure_code/gf_3vect_dot_prod_sse_perf \
erasure_code/gf_4vect_dot_prod_sse_perf \
erasure_code/gf_5vect_dot_prod_sse_perf \
erasure_code/gf_6vect_dot_prod_sse_perf \
erasure_code/gf_vect_dot_prod_perf \
erasure_code/gf_vect_dot_prod_1tbl \
erasure_code/gf_vect_mad_perf \
erasure_code/erasure_code_perf \
erasure_code/erasure_code_base_perf \
erasure_code/erasure_code_sse_perf \
erasure_code/erasure_code_update_perf
other_src += include/test.h \
include/types.h

erasure_code/ec_base.c Normal file

@@ -0,0 +1,360 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <limits.h>
#include <string.h> // for memset
#include "erasure_code.h"
#include "ec_base.h" // for GF tables
#include "types.h"
unsigned char gf_mul(unsigned char a, unsigned char b)
{
#ifndef GF_LARGE_TABLES
int i;
if ((a == 0) || (b == 0))
return 0;
return gff_base[(i = gflog_base[a] + gflog_base[b]) > 254 ? i - 255 : i];
#else
return gf_mul_table_base[b * 256 + a];
#endif
}
unsigned char gf_inv(unsigned char a)
{
#ifndef GF_LARGE_TABLES
if (a == 0)
return 0;
return gff_base[255 - gflog_base[a]];
#else
return gf_inv_table_base[a];
#endif
}
void gf_gen_rs_matrix(unsigned char *a, int m, int k)
{
int i, j;
unsigned char p, gen = 1;
memset(a, 0, k * m);
for (i = 0; i < k; i++)
a[k * i + i] = 1;
for (i = k; i < m; i++) {
p = 1;
for (j = 0; j < k; j++) {
a[k * i + j] = p;
p = gf_mul(p, gen);
}
gen = gf_mul(gen, 2);
}
}
void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k)
{
int i, j;
unsigned char *p;
// Identity matrix in high position
memset(a, 0, k * m);
for (i = 0; i < k; i++)
a[k * i + i] = 1;
// For the rest choose 1/(i + j) | i != j
p = &a[k * k];
for (i = k; i < m; i++)
for (j = 0; j < k; j++)
*p++ = gf_inv(i ^ j);
}
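// Invert an n x n matrix over GF(2^8) by Gauss-Jordan elimination.
// Note: in_mat is reduced in place, so the caller's copy is clobbered.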
int gf_invert_matrix(unsigned char *in_mat, unsigned char *out_mat, const int n)
{
int i, j, k;
unsigned char temp;
// Set out_mat[] to the identity matrix
for (i = 0; i < n * n; i++) // memset(out_mat, 0, n*n)
out_mat[i] = 0;
for (i = 0; i < n; i++)
out_mat[i * n + i] = 1;
// Inverse
for (i = 0; i < n; i++) {
// Check for 0 in pivot element
if (in_mat[i * n + i] == 0) {
// Find a row with non-zero in current column and swap
for (j = i + 1; j < n; j++)
if (in_mat[j * n + i])
break;
if (j == n) // Couldn't find means it's singular
return -1;
for (k = 0; k < n; k++) { // Swap rows i,j
temp = in_mat[i * n + k];
in_mat[i * n + k] = in_mat[j * n + k];
in_mat[j * n + k] = temp;
temp = out_mat[i * n + k];
out_mat[i * n + k] = out_mat[j * n + k];
out_mat[j * n + k] = temp;
}
}
temp = gf_inv(in_mat[i * n + i]); // 1/pivot
for (j = 0; j < n; j++) { // Scale row i by 1/pivot
in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
}
for (j = 0; j < n; j++) {
if (j == i)
continue;
temp = in_mat[j * n + i];
for (k = 0; k < n; k++) {
out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
}
}
}
return 0;
}
// Calculates const table gftbl in GF(2^8) from single input A
// gftbl(A) = {A{00}, A{01}, A{02}, ... , A{0f} }, {A{00}, A{10}, A{20}, ... , A{f0} }
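// tbl must hold 32 bytes: products of c with the low-nibble values in
// tbl[0..15] and with the high-nibble values in tbl[16..31]; tbl[1] == c.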
void gf_vect_mul_init(unsigned char c, unsigned char *tbl)
{
unsigned char c2 = (c << 1) ^ ((c & 0x80) ? 0x1d : 0); //Mult by GF{2}
unsigned char c4 = (c2 << 1) ^ ((c2 & 0x80) ? 0x1d : 0); //Mult by GF{2}
unsigned char c8 = (c4 << 1) ^ ((c4 & 0x80) ? 0x1d : 0); //Mult by GF{2}
#if __WORDSIZE == 64 || _WIN64 || __x86_64__
unsigned long long v1, v2, v4, v8, *t;
unsigned long long v10, v20, v40, v80;
unsigned char c17, c18, c20, c24;
t = (unsigned long long *)tbl;
v1 = c * 0x0100010001000100ull;
v2 = c2 * 0x0101000001010000ull;
v4 = c4 * 0x0101010100000000ull;
v8 = c8 * 0x0101010101010101ull;
v4 = v1 ^ v2 ^ v4;
t[0] = v4;
t[1] = v8 ^ v4;
c17 = (c8 << 1) ^ ((c8 & 0x80) ? 0x1d : 0); //Mult by GF{2}
c18 = (c17 << 1) ^ ((c17 & 0x80) ? 0x1d : 0); //Mult by GF{2}
c20 = (c18 << 1) ^ ((c18 & 0x80) ? 0x1d : 0); //Mult by GF{2}
c24 = (c20 << 1) ^ ((c20 & 0x80) ? 0x1d : 0); //Mult by GF{2}
v10 = c17 * 0x0100010001000100ull;
v20 = c18 * 0x0101000001010000ull;
v40 = c20 * 0x0101010100000000ull;
v80 = c24 * 0x0101010101010101ull;
v40 = v10 ^ v20 ^ v40;
t[2] = v40;
t[3] = v80 ^ v40;
#else // 32-bit or other
unsigned char c3, c5, c6, c7, c9, c10, c11, c12, c13, c14, c15;
unsigned char c17, c18, c19, c20, c21, c22, c23, c24, c25, c26, c27, c28, c29, c30,
c31;
c3 = c2 ^ c;
c5 = c4 ^ c;
c6 = c4 ^ c2;
c7 = c4 ^ c3;
c9 = c8 ^ c;
c10 = c8 ^ c2;
c11 = c8 ^ c3;
c12 = c8 ^ c4;
c13 = c8 ^ c5;
c14 = c8 ^ c6;
c15 = c8 ^ c7;
tbl[0] = 0;
tbl[1] = c;
tbl[2] = c2;
tbl[3] = c3;
tbl[4] = c4;
tbl[5] = c5;
tbl[6] = c6;
tbl[7] = c7;
tbl[8] = c8;
tbl[9] = c9;
tbl[10] = c10;
tbl[11] = c11;
tbl[12] = c12;
tbl[13] = c13;
tbl[14] = c14;
tbl[15] = c15;
c17 = (c8 << 1) ^ ((c8 & 0x80) ? 0x1d : 0); //Mult by GF{2}
c18 = (c17 << 1) ^ ((c17 & 0x80) ? 0x1d : 0); //Mult by GF{2}
c19 = c18 ^ c17;
c20 = (c18 << 1) ^ ((c18 & 0x80) ? 0x1d : 0); //Mult by GF{2}
c21 = c20 ^ c17;
c22 = c20 ^ c18;
c23 = c20 ^ c19;
c24 = (c20 << 1) ^ ((c20 & 0x80) ? 0x1d : 0); //Mult by GF{2}
c25 = c24 ^ c17;
c26 = c24 ^ c18;
c27 = c24 ^ c19;
c28 = c24 ^ c20;
c29 = c24 ^ c21;
c30 = c24 ^ c22;
c31 = c24 ^ c23;
tbl[16] = 0;
tbl[17] = c17;
tbl[18] = c18;
tbl[19] = c19;
tbl[20] = c20;
tbl[21] = c21;
tbl[22] = c22;
tbl[23] = c23;
tbl[24] = c24;
tbl[25] = c25;
tbl[26] = c26;
tbl[27] = c27;
tbl[28] = c28;
tbl[29] = c29;
tbl[30] = c30;
tbl[31] = c31;
#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
}
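// Reference dot-product: dest[i] = XOR over j of gf_mul(src[j][i], coef_j),
// where coef_j is read from entry 1 of the j-th 32-byte table in v.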
void gf_vect_dot_prod_base(int len, int vlen, unsigned char *v,
unsigned char **src, unsigned char *dest)
{
int i, j;
unsigned char s;
for (i = 0; i < len; i++) {
s = 0;
for (j = 0; j < vlen; j++)
s ^= gf_mul(src[j][i], v[j * 32 + 1]);
dest[i] = s;
}
}
void gf_vect_mad_base(int len, int vec, int vec_i,
unsigned char *v, unsigned char *src, unsigned char *dest)
{
int i;
unsigned char s;
for (i = 0; i < len; i++) {
s = dest[i];
s ^= gf_mul(src[i], v[vec_i * 32 + 1]);
dest[i] = s;
}
}
void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v,
unsigned char **src, unsigned char **dest)
{
int i, j, l;
unsigned char s;
for (l = 0; l < dests; l++) {
for (i = 0; i < len; i++) {
s = 0;
for (j = 0; j < srcs; j++)
s ^= gf_mul(src[j][i], v[j * 32 + l * srcs * 32 + 1]);
dest[l][i] = s;
}
}
}
void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
unsigned char *data, unsigned char **dest)
{
int i, l;
unsigned char s;
for (l = 0; l < rows; l++) {
for (i = 0; i < len; i++) {
s = dest[l][i];
s ^= gf_mul(data[i], v[vec_i * 32 + l * k * 32 + 1]);
dest[l][i] = s;
}
}
}
void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest)
{
//2nd element of table array is ref value used to fill it in
unsigned char c = a[1];
while (len-- > 0)
*dest++ = gf_mul(c, *src++);
}
struct slver {
UINT16 snum;
UINT8 ver;
UINT8 core;
};
// Version info
struct slver gf_vect_mul_init_slver_00020035;
struct slver gf_vect_mul_init_slver = { 0x0035, 0x02, 0x00 };
struct slver ec_encode_data_base_slver_00010135;
struct slver ec_encode_data_base_slver = { 0x0135, 0x01, 0x00 };
struct slver gf_vect_mul_base_slver_00010136;
struct slver gf_vect_mul_base_slver = { 0x0136, 0x01, 0x00 };
struct slver gf_vect_dot_prod_base_slver_00010137;
struct slver gf_vect_dot_prod_base_slver = { 0x0137, 0x01, 0x00 };
struct slver gf_mul_slver_00000214;
struct slver gf_mul_slver = { 0x0214, 0x00, 0x00 };
struct slver gf_invert_matrix_slver_00000215;
struct slver gf_invert_matrix_slver = { 0x0215, 0x00, 0x00};
struct slver gf_gen_rs_matrix_slver_00000216;
struct slver gf_gen_rs_matrix_slver = { 0x0216, 0x00, 0x00 };
struct slver gf_gen_cauchy1_matrix_slver_00000217;
struct slver gf_gen_cauchy1_matrix_slver = { 0x0217, 0x00, 0x00};

erasure_code/ec_base.h Normal file
File diff suppressed because it is too large (6680 lines)

erasure_code/ec_highlevel_func.c Normal file

@@ -0,0 +1,267 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <limits.h>
#include "erasure_code.h"
#include "types.h"
void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
{
int i, j;
for (i = 0; i < rows; i++) {
for (j = 0; j < k; j++) {
gf_vect_mul_init(*a++, g_tbls);
g_tbls += 32;
}
}
}
void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
unsigned char **coding)
{
if (len < 16) {
ec_encode_data_base(len, k, rows, g_tbls, data, coding);
return;
}
while (rows >= 4) {
gf_4vect_dot_prod_sse(len, k, g_tbls, data, coding);
g_tbls += 4 * k * 32;
coding += 4;
rows -= 4;
}
switch (rows) {
case 3:
gf_3vect_dot_prod_sse(len, k, g_tbls, data, coding);
break;
case 2:
gf_2vect_dot_prod_sse(len, k, g_tbls, data, coding);
break;
case 1:
gf_vect_dot_prod_sse(len, k, g_tbls, data, *coding);
break;
case 0:
break;
}
}
void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
unsigned char **coding)
{
if (len < 16) {
ec_encode_data_base(len, k, rows, g_tbls, data, coding);
return;
}
while (rows >= 4) {
gf_4vect_dot_prod_avx(len, k, g_tbls, data, coding);
g_tbls += 4 * k * 32;
coding += 4;
rows -= 4;
}
switch (rows) {
case 3:
gf_3vect_dot_prod_avx(len, k, g_tbls, data, coding);
break;
case 2:
gf_2vect_dot_prod_avx(len, k, g_tbls, data, coding);
break;
case 1:
gf_vect_dot_prod_avx(len, k, g_tbls, data, *coding);
break;
case 0:
break;
}
}
void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
unsigned char **coding)
{
if (len < 32) {
ec_encode_data_base(len, k, rows, g_tbls, data, coding);
return;
}
while (rows >= 4) {
gf_4vect_dot_prod_avx2(len, k, g_tbls, data, coding);
g_tbls += 4 * k * 32;
coding += 4;
rows -= 4;
}
switch (rows) {
case 3:
gf_3vect_dot_prod_avx2(len, k, g_tbls, data, coding);
break;
case 2:
gf_2vect_dot_prod_avx2(len, k, g_tbls, data, coding);
break;
case 1:
gf_vect_dot_prod_avx2(len, k, g_tbls, data, *coding);
break;
case 0:
break;
}
}
#if __WORDSIZE == 64 || _WIN64 || __x86_64__
void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
unsigned char *data, unsigned char **coding)
{
if (len < 16) {
ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
return;
}
while (rows > 6) {
gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
g_tbls += 6 * k * 32;
coding += 6;
rows -= 6;
}
switch (rows) {
case 6:
gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
break;
case 5:
gf_5vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
break;
case 4:
gf_4vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
break;
case 3:
gf_3vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
break;
case 2:
gf_2vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
break;
case 1:
gf_vect_mad_sse(len, k, vec_i, g_tbls, data, *coding);
break;
case 0:
break;
}
}
void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
unsigned char *data, unsigned char **coding)
{
if (len < 16) {
ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
return;
}
while (rows > 6) {
gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
g_tbls += 6 * k * 32;
coding += 6;
rows -= 6;
}
switch (rows) {
case 6:
gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
break;
case 5:
gf_5vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
break;
case 4:
gf_4vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
break;
case 3:
gf_3vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
break;
case 2:
gf_2vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
break;
case 1:
gf_vect_mad_avx(len, k, vec_i, g_tbls, data, *coding);
break;
case 0:
break;
}
}
void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
unsigned char *data, unsigned char **coding)
{
if (len < 32) {
ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
return;
}
while (rows > 6) {
gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
g_tbls += 6 * k * 32;
coding += 6;
rows -= 6;
}
switch (rows) {
case 6:
gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
break;
case 5:
gf_5vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
break;
case 4:
gf_4vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
break;
case 3:
gf_3vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
break;
case 2:
gf_2vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
break;
case 1:
gf_vect_mad_avx2(len, k, vec_i, g_tbls, data, *coding);
break;
case 0:
break;
}
}
#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
struct slver {
UINT16 snum;
UINT8 ver;
UINT8 core;
};
// Version info
struct slver ec_init_tables_slver_00010068;
struct slver ec_init_tables_slver = { 0x0068, 0x01, 0x00 };
struct slver ec_encode_data_sse_slver_00020069;
struct slver ec_encode_data_sse_slver = { 0x0069, 0x02, 0x00 };

erasure_code/ec_multibinary.asm Normal file

@@ -0,0 +1,395 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifidn __OUTPUT_FORMAT__, elf64
%define WRT_OPT wrt ..plt
%else
%define WRT_OPT
%endif
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf32
[bits 32]
%define def_wrd dd
%define wrd_sz dword
%define arg1 esi
%define arg2 eax
%define arg3 ebx
%define arg4 ecx
%define arg5 edx
%else
default rel
[bits 64]
%define def_wrd dq
%define wrd_sz qword
%define arg1 rsi
%define arg2 rax
%define arg3 rbx
%define arg4 rcx
%define arg5 rdx
extern ec_encode_data_update_sse
extern ec_encode_data_update_avx
extern ec_encode_data_update_avx2
extern gf_vect_mul_sse
extern gf_vect_mul_avx
extern gf_vect_mad_sse
extern gf_vect_mad_avx
extern gf_vect_mad_avx2
%endif
extern gf_vect_mul_base
extern ec_encode_data_base
extern ec_encode_data_update_base
extern gf_vect_dot_prod_base
extern gf_vect_mad_base
extern gf_vect_dot_prod_sse
extern gf_vect_dot_prod_avx
extern gf_vect_dot_prod_avx2
extern ec_encode_data_sse
extern ec_encode_data_avx
extern ec_encode_data_avx2
section .data
;;; *_mbinit are the initial values for *_dispatched, which is updated on the
;;; first call. Therefore, *_dispatch_init is only executed on the first call.
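;;; Example: the first call to ec_encode_data lands on ec_encode_data_mbinit,
;;; which runs the CPUID/XGETBV-based dispatch_init, stores the chosen
;;; base/SSE/AVX/AVX2 entry point in ec_encode_data_dispatched, and falls
;;; through to the indirect jmp; later calls jump straight to that routine.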
ec_encode_data_dispatched:
def_wrd ec_encode_data_mbinit
gf_vect_mul_dispatched:
def_wrd gf_vect_mul_mbinit
gf_vect_dot_prod_dispatched:
def_wrd gf_vect_dot_prod_mbinit
ec_encode_data_update_dispatched:
def_wrd ec_encode_data_update_mbinit
gf_vect_mad_dispatched:
def_wrd gf_vect_mad_mbinit
section .text
;;;;
; ec_encode_data multibinary function
;;;;
global ec_encode_data:function
ec_encode_data_mbinit:
call ec_encode_data_dispatch_init
ec_encode_data:
jmp wrd_sz [ec_encode_data_dispatched]
ec_encode_data_dispatch_init:
push arg1
push arg2
push arg3
push arg4
push arg5
lea arg1, [ec_encode_data_base WRT_OPT] ; Default
mov eax, 1
cpuid
lea arg3, [ec_encode_data_sse WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE4_1
cmovne arg1, arg3
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
lea arg3, [ec_encode_data_avx WRT_OPT]
jne _done_ec_encode_data_init
mov arg1, arg3
;; Try for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID1_EBX_AVX2
lea arg3, [ec_encode_data_avx2 WRT_OPT]
cmovne arg1, arg3
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
je _done_ec_encode_data_init
lea arg1, [ec_encode_data_sse WRT_OPT]
_done_ec_encode_data_init:
pop arg5
pop arg4
pop arg3
pop arg2
mov [ec_encode_data_dispatched], arg1
pop arg1
ret
;;;;
; gf_vect_mul multibinary function
;;;;
global gf_vect_mul:function
gf_vect_mul_mbinit:
call gf_vect_mul_dispatch_init
gf_vect_mul:
jmp wrd_sz [gf_vect_mul_dispatched]
gf_vect_mul_dispatch_init:
push arg1
%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
lea arg1, [gf_vect_mul_base]
%else
push rax
push rbx
push rcx
push rdx
lea arg1, [gf_vect_mul_base WRT_OPT] ; Default
mov eax, 1
cpuid
test ecx, FLAG_CPUID1_ECX_SSE4_2
lea rbx, [gf_vect_mul_sse WRT_OPT]
je _done_gf_vect_mul_dispatch_init
mov arg1, rbx
;; Try for AVX
and ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
cmp ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
jne _done_gf_vect_mul_dispatch_init
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
jne _done_gf_vect_mul_dispatch_init
lea arg1, [gf_vect_mul_avx WRT_OPT]
_done_gf_vect_mul_dispatch_init:
pop rdx
pop rcx
pop rbx
pop rax
%endif ;; END 32-bit check
mov [gf_vect_mul_dispatched], arg1
pop arg1
ret
;;;;
; ec_encode_data_update multibinary function
;;;;
global ec_encode_data_update:function
ec_encode_data_update_mbinit:
call ec_encode_data_update_dispatch_init
ec_encode_data_update:
jmp wrd_sz [ec_encode_data_update_dispatched]
ec_encode_data_update_dispatch_init:
push arg1
%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
lea arg1, [ec_encode_data_update_base]
%else
push rax
push rbx
push rcx
push rdx
lea arg1, [ec_encode_data_update_base WRT_OPT] ; Default
mov eax, 1
cpuid
lea rbx, [ec_encode_data_update_sse WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE4_1
cmovne arg1, rbx
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
lea rbx, [ec_encode_data_update_avx WRT_OPT]
jne _done_ec_encode_data_update_init
mov rsi, rbx
;; Try for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID1_EBX_AVX2
lea rbx, [ec_encode_data_update_avx2 WRT_OPT]
cmovne rsi, rbx
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
je _done_ec_encode_data_update_init
lea rsi, [ec_encode_data_update_sse WRT_OPT]
_done_ec_encode_data_update_init:
pop rdx
pop rcx
pop rbx
pop rax
%endif ;; END 32-bit check
mov [ec_encode_data_update_dispatched], arg1
pop arg1
ret
;;;;
; gf_vect_dot_prod multibinary function
;;;;
global gf_vect_dot_prod:function
gf_vect_dot_prod_mbinit:
call gf_vect_dot_prod_dispatch_init
gf_vect_dot_prod:
jmp wrd_sz [gf_vect_dot_prod_dispatched]
gf_vect_dot_prod_dispatch_init:
push arg1
push arg2
push arg3
push arg4
push arg5
lea arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default
mov eax, 1
cpuid
lea arg3, [gf_vect_dot_prod_sse WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE4_1
cmovne arg1, arg3
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
lea arg3, [gf_vect_dot_prod_avx WRT_OPT]
jne _done_gf_vect_dot_prod_init
mov arg1, arg3
;; Try for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID1_EBX_AVX2
lea arg3, [gf_vect_dot_prod_avx2 WRT_OPT]
cmovne arg1, arg3
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
je _done_gf_vect_dot_prod_init
lea arg1, [gf_vect_dot_prod_sse WRT_OPT]
_done_gf_vect_dot_prod_init:
pop arg5
pop arg4
pop arg3
pop arg2
mov [gf_vect_dot_prod_dispatched], arg1
pop arg1
ret
;;;;
; gf_vect_mad multibinary function
;;;;
global gf_vect_mad:function
gf_vect_mad_mbinit:
call gf_vect_mad_dispatch_init
gf_vect_mad:
jmp wrd_sz [gf_vect_mad_dispatched]
gf_vect_mad_dispatch_init:
push arg1
%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
lea arg1, [gf_vect_mad_base]
%else
push rax
push rbx
push rcx
push rdx
lea arg1, [gf_vect_mad_base WRT_OPT] ; Default
mov eax, 1
cpuid
lea rbx, [gf_vect_mad_sse WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE4_1
cmovne arg1, rbx
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
lea rbx, [gf_vect_mad_avx WRT_OPT]
jne _done_gf_vect_mad_init
mov rsi, rbx
;; Try for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID1_EBX_AVX2
lea rbx, [gf_vect_mad_avx2 WRT_OPT]
cmovne rsi, rbx
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
je _done_gf_vect_mad_init
lea rsi, [gf_vect_mad_sse WRT_OPT]
_done_gf_vect_mad_init:
pop rdx
pop rcx
pop rbx
pop rax
%endif ;; END 32-bit check
mov [gf_vect_mad_dispatched], arg1
pop arg1
ret
;;; func core, ver, snum
slversion ec_encode_data, 00, 04, 0133
slversion gf_vect_mul, 00, 03, 0134
slversion ec_encode_data_update, 00, 03, 0212
slversion gf_vect_dot_prod, 00, 03, 0138
slversion gf_vect_mad, 00, 02, 0213

erasure_code/erasure_code_base_perf.c Normal file

@@ -0,0 +1,168 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
# define TEST_LOOPS(m) (100*m)
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 32
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
# define TEST_LOOPS(m) (10)
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS(m) 1000
# endif
# endif
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i, j, rtest, m, k, nerrs, r;
void *buf;
u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
struct perf start, stop;
// Pick test parameters
m = 14;
k = 10;
nerrs = 4;
const u8 err_list[] = { 2, 4, 5, 7 };
printf("erasure_code_base_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
if (m > MMAX || k > KMAX || nerrs > (m - k)) {
printf(" Input test parameter error\n");
return -1;
}
memcpy(src_err_list, err_list, nerrs);
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0; i < nerrs; i++)
src_in_err[src_err_list[i]] = 1;
// Allocate the arrays
for (i = 0; i < m; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < (m - k); i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
temp_buffs[i] = buf;
}
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN(m); j++)
buffs[i][j] = rand();
gf_gen_rs_matrix(a, m, k);
ec_init_tables(k, m - k, &a[k * k], g_tbls);
ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
// Start encode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Make parity vects
ec_init_tables(k, m - k, &a[k * k], g_tbls);
ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
}
perf_stop(&stop);
printf("erasure_code_base_encode" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
// Start decode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
recov[i] = buffs[r];
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix(b, d, k) < 0) {
printf("BAD MATRIX\n");
return -1;
}
for (i = 0; i < nerrs; i++)
for (j = 0; j < k; j++)
c[k * i + j] = d[k * src_err_list[i] + j];
// Recover data
ec_init_tables(k, nerrs, c, g_tbls);
ec_encode_data_base(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
}
perf_stop(&stop);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
return -1;
}
}
printf("erasure_code_base_decode" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);
printf("done all: Pass\n");
return 0;
}

erasure_code/erasure_code_base_test.c Normal file

@@ -0,0 +1,764 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 127
#endif
#ifndef RANDOMS
# define RANDOMS 50
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
#define EFENCE_TEST_MIN_SIZE 16
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
#ifndef TEST_SEED
#define TEST_SEED 11
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
// Generate Random errors
static void gen_err_list(unsigned char *src_err_list,
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
{
int i, err;
int nerrs = 0, nsrcerrs = 0;
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err) {
src_err_list[nerrs++] = i;
if (i < k) {
nsrcerrs++;
}
}
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= m) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
if (err < k)
nsrcerrs = 1;
}
*pnerrs = nerrs;
*pnsrcerrs = nsrcerrs;
return;
}
#define NO_INVERT_MATRIX -2
// Generate decode matrix from encode matrix
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
unsigned char *decode_matrix,
unsigned char *invert_matrix,
unsigned int *decode_index,
unsigned char *src_err_list,
unsigned char *src_in_err,
int nerrs, int nsrcerrs, int k, int m)
{
int i, j, p;
int r;
unsigned char *backup, *b, s;
int incr = 0;
b = malloc(MMAX * KMAX);
backup = malloc(MMAX * KMAX);
if (b == NULL || backup == NULL) {
printf("Test failure! Error with malloc\n");
free(b);
free(backup);
return -1;
}
// Construct matrix b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
for (j = 0; j < k; j++) {
b[k * i + j] = encode_matrix[k * r + j];
backup[k * i + j] = encode_matrix[k * r + j];
}
decode_index[i] = r;
}
incr = 0;
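	// If the selected rows are singular (possible when the encode matrix came
	// from gf_gen_rs_matrix), substitute the next unused encode-matrix row for
	// the last selected row and retry the inversion.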
while (gf_invert_matrix(b, invert_matrix, k) < 0) {
if (nerrs == (m - k)) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
incr++;
memcpy(b, backup, MMAX * KMAX);
for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
if (src_err_list[i] == (decode_index[k - 1] + incr)) {
// skip the erased parity line
incr++;
continue;
}
}
if (decode_index[k - 1] + incr >= m) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
decode_index[k - 1] += incr;
for (j = 0; j < k; j++)
b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];
};
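	// Rows for erased source buffers come directly from the inverted matrix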
for (i = 0; i < nsrcerrs; i++) {
for (j = 0; j < k; j++) {
decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
}
}
/* src_err_list from encode_matrix * invert of b for parity decoding */
for (p = nsrcerrs; p < nerrs; p++) {
for (i = 0; i < k; i++) {
s = 0;
for (j = 0; j < k; j++)
s ^= gf_mul(invert_matrix[j * k + i],
encode_matrix[k * src_err_list[p] + j]);
decode_matrix[k * p + i] = s;
}
}
free(b);
free(backup);
return 0;
}
int main(int argc, char *argv[])
{
int re = 0;
int i, j, p, rtest, m, k;
int nerrs, nsrcerrs;
void *buf;
unsigned int decode_index[MMAX];
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
unsigned char *recov[TEST_SOURCES];
int rows, align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *temp_ubuffs[TEST_SOURCES];
printf("erasure_code_base_test: %dx%d ", TEST_SOURCES, TEST_LEN);
srand(TEST_SEED);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buffs[i] = buf;
}
// Test erasure code by encode and recovery
encode_matrix = malloc(MMAX * KMAX);
decode_matrix = malloc(MMAX * KMAX);
invert_matrix = malloc(MMAX * KMAX);
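	// g_tbls: ec_init_tables expands each matrix coefficient into a 32-byte
	// multiply table, hence 32 bytes per source per output row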
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
if (encode_matrix == NULL || decode_matrix == NULL
|| invert_matrix == NULL || g_tbls == NULL) {
printf("Test failure! Error with malloc\n");
return -1;
}
// Pick a first test
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Generate encode matrix encode_matrix
// The matrix generated by gf_gen_rs_matrix
// is not always invertible.
gf_gen_rs_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Repeat the same parameters with a Cauchy encode matrix
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
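// Each region is positioned so it ends exactly at the end of its TEST_LEN
// allocation; any access past 'size' bytes then lands outside the buffer,
// where a tool such as Electric Fence can catch it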
k = 16;
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
if (k > KMAX)
return -1;
for (rows = 1; rows <= 16; rows++) {
m = k + rows;
if (m > MMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(size, k, m - k, g_tbls, efence_buffs,
&efence_buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = efence_buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 !=
memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
size)) {
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
k, nerrs);
printf("size = %d\n", size);
printf("Test erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], align);
printf("orig :");
dump(efence_buffs[src_err_list[i]], align);
return -1;
}
}
}
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
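// Shift each source and destination by a random 0..PTR_ALIGN_CHK_B-1 bytes
// to exercise unaligned pointers; the zeroed padding around each region is
// checked afterwards for stray writes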
// Add random offsets
for (i = 0; i < m; i++) {
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
}
for (i = 0; i < k; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = ubuffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(ubuffs, m, 25);
printf("orig :");
dump(ubuffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_ubuffs[k + i], 25);
return -1;
}
}
// Confirm that padding around dests is unchanged
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
for (i = 0; i < m; i++) {
offset = ubuffs[i] - buffs[i];
if (memcmp(buffs[i], temp_buffs[0], offset)) {
printf("Fail rand ualign encode pad start\n");
return -1;
}
if (memcmp
(buffs[i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign encode pad end\n");
return -1;
}
}
for (i = 0; i < nerrs; i++) {
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
printf("Fail rand ualign decode pad start\n");
return -1;
}
if (memcmp
(temp_buffs[k + i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign decode pad end\n");
return -1;
}
}
putchar('.');
}
// Test size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;
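// Step the length down by 13 (an odd stride) to cover sizes that are not a
// multiple of 16; with LEN_ALIGN_CHK_B == 0 only multiples of 16 are tested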
for (size = TEST_LEN; size > 0; size -= align) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
for (i = 0; i < k; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_base(size, k, m - k, g_tbls, buffs, &buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
}
printf("done EC tests: Pass\n");
return 0;
}
@ -0,0 +1,168 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
# define TEST_LOOPS(m) (10000*m)
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 32
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
# define TEST_LOOPS(m) (50*m)
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS(m) 1000
# endif
# endif
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i, j, rtest, m, k, nerrs, r;
void *buf;
u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
struct perf start, stop;
// Pick test parameters
m = 14;
k = 10;
nerrs = 4;
const u8 err_list[] = { 2, 4, 5, 7 };
printf("erasure_code_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
if (m > MMAX || k > KMAX || nerrs > (m - k)) {
printf(" Input test parameter error\n");
return -1;
}
memcpy(src_err_list, err_list, nerrs);
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0; i < nerrs; i++)
src_in_err[src_err_list[i]] = 1;
// Allocate the arrays
for (i = 0; i < m; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < (m - k); i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
temp_buffs[i] = buf;
}
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN(m); j++)
buffs[i][j] = rand();
gf_gen_rs_matrix(a, m, k);
ec_init_tables(k, m - k, &a[k * k], g_tbls);
ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
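// The pass above is untimed; the loop below repeats table init + encode
// under the timer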
// Start encode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Make parity vects
ec_init_tables(k, m - k, &a[k * k], g_tbls);
ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
}
perf_stop(&stop);
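// Encode bandwidth is reported over m buffers of TEST_LEN(m) bytes per
// iteration (k sources read, m - k parity buffers written)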
printf("erasure_code_encode" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
// Start decode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
recov[i] = buffs[r];
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix(b, d, k) < 0) {
printf("BAD MATRIX\n");
return -1;
}
for (i = 0; i < nerrs; i++)
for (j = 0; j < k; j++)
c[k * i + j] = d[k * src_err_list[i] + j];
// Recover data
ec_init_tables(k, nerrs, c, g_tbls);
ec_encode_data(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
}
perf_stop(&stop);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
return -1;
}
}
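// Decode bandwidth counts k + nerrs buffers per iteration (k recovery
// sources read, nerrs buffers reconstructed)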
printf("erasure_code_decode" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);
printf("done all: Pass\n");
return 0;
}
@ -0,0 +1,168 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
# define TEST_LOOPS(m) (10000*m)
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 32
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
# define TEST_LOOPS(m) (50*m)
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS(m) 1000
# endif
# endif
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i, j, rtest, m, k, nerrs, r;
void *buf;
u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
struct perf start, stop;
// Pick test parameters
m = 14;
k = 10;
nerrs = 4;
const u8 err_list[] = { 2, 4, 5, 7 };
printf("erasure_code_sse_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
if (m > MMAX || k > KMAX || nerrs > (m - k)) {
printf(" Input test parameter error\n");
return -1;
}
memcpy(src_err_list, err_list, nerrs);
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0; i < nerrs; i++)
src_in_err[src_err_list[i]] = 1;
// Allocate the arrays
for (i = 0; i < m; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < (m - k); i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
temp_buffs[i] = buf;
}
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN(m); j++)
buffs[i][j] = rand();
gf_gen_rs_matrix(a, m, k);
ec_init_tables(k, m - k, &a[k * k], g_tbls);
ec_encode_data_sse(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
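// One untimed pass with the SSE dot-product routine; the timed loop below
// repeats it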
// Start encode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Make parity vects
ec_init_tables(k, m - k, &a[k * k], g_tbls);
ec_encode_data_sse(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
}
perf_stop(&stop);
printf("erasure_code_sse_encode" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
// Start decode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
recov[i] = buffs[r];
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix(b, d, k) < 0) {
printf("BAD MATRIX\n");
return -1;
}
for (i = 0; i < nerrs; i++)
for (j = 0; j < k; j++)
c[k * i + j] = d[k * src_err_list[i] + j];
// Recover data
ec_init_tables(k, nerrs, c, g_tbls);
ec_encode_data_sse(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
}
perf_stop(&stop);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
return -1;
}
}
printf("erasure_code_sse_decode" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);
printf("done all: Pass\n");
return 0;
}
@ -0,0 +1,764 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 127
#endif
#ifndef RANDOMS
# define RANDOMS 200
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
#define EFENCE_TEST_MIN_SIZE 16
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
#ifndef TEST_SEED
#define TEST_SEED 11
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
// Generate Random errors
static void gen_err_list(unsigned char *src_err_list,
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
{
int i, err;
int nerrs = 0, nsrcerrs = 0;
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err) {
src_err_list[nerrs++] = i;
if (i < k) {
nsrcerrs++;
}
}
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= m) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
if (err < k)
nsrcerrs = 1;
}
*pnerrs = nerrs;
*pnsrcerrs = nsrcerrs;
return;
}
#define NO_INVERT_MATRIX -2
// Generate decode matrix from encode matrix
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
unsigned char *decode_matrix,
unsigned char *invert_matrix,
unsigned int *decode_index,
unsigned char *src_err_list,
unsigned char *src_in_err,
int nerrs, int nsrcerrs, int k, int m)
{
int i, j, p;
int r;
unsigned char *backup, *b, s;
int incr = 0;
b = malloc(MMAX * KMAX);
backup = malloc(MMAX * KMAX);
if (b == NULL || backup == NULL) {
printf("Test failure! Error with malloc\n");
free(b);
free(backup);
return -1;
}
// Construct matrix b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
for (j = 0; j < k; j++) {
b[k * i + j] = encode_matrix[k * r + j];
backup[k * i + j] = encode_matrix[k * r + j];
}
decode_index[i] = r;
}
incr = 0;
while (gf_invert_matrix(b, invert_matrix, k) < 0) {
if (nerrs == (m - k)) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
incr++;
memcpy(b, backup, MMAX * KMAX);
for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
if (src_err_list[i] == (decode_index[k - 1] + incr)) {
// skip the erased parity line
incr++;
continue;
}
}
if (decode_index[k - 1] + incr >= m) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
decode_index[k - 1] += incr;
for (j = 0; j < k; j++)
b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];
};
for (i = 0; i < nsrcerrs; i++) {
for (j = 0; j < k; j++) {
decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
}
}
/* src_err_list from encode_matrix * invert of b for parity decoding */
for (p = nsrcerrs; p < nerrs; p++) {
for (i = 0; i < k; i++) {
s = 0;
for (j = 0; j < k; j++)
s ^= gf_mul(invert_matrix[j * k + i],
encode_matrix[k * src_err_list[p] + j]);
decode_matrix[k * p + i] = s;
}
}
free(b);
free(backup);
return 0;
}
int main(int argc, char *argv[])
{
int re = 0;
int i, j, p, rtest, m, k;
int nerrs, nsrcerrs;
void *buf;
unsigned int decode_index[MMAX];
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
unsigned char *recov[TEST_SOURCES];
int rows, align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *temp_ubuffs[TEST_SOURCES];
printf("erasure_code_sse_test: %dx%d ", TEST_SOURCES, TEST_LEN);
srand(TEST_SEED);
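// Fixed seed keeps the randomized cases reproducible from run to run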
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buffs[i] = buf;
}
// Test erasure code by encode and recovery
encode_matrix = malloc(MMAX * KMAX);
decode_matrix = malloc(MMAX * KMAX);
invert_matrix = malloc(MMAX * KMAX);
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
if (encode_matrix == NULL || decode_matrix == NULL
|| invert_matrix == NULL || g_tbls == NULL) {
printf("Test failure! Error with malloc\n");
return -1;
}
// Pick a first test
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Generate encode matrix encode_matrix
// The matrix generated by gf_gen_rs_matrix
// is not always invertible.
gf_gen_rs_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Repeat the same parameters with a Cauchy encode matrix
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
k = 16;
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
if (k > KMAX)
return -1;
for (rows = 1; rows <= 16; rows++) {
m = k + rows;
if (m > MMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_sse(size, k, m - k, g_tbls, efence_buffs,
&efence_buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = efence_buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 !=
memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
size)) {
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
k, nerrs);
printf("size = %d\n", size);
printf("Test erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], align);
printf("orig :");
dump(efence_buffs[src_err_list[i]], align);
return -1;
}
}
}
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
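// Offset sources and destinations by a random 0..PTR_ALIGN_CHK_B-1 bytes so
// the SSE paths see unaligned pointers; the surrounding zero padding is
// verified afterwards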
// Add random offsets
for (i = 0; i < m; i++) {
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
}
for (i = 0; i < k; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_sse(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = ubuffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(ubuffs, m, 25);
printf("orig :");
dump(ubuffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_ubuffs[k + i], 25);
return -1;
}
}
// Confirm that padding around dests is unchanged
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
for (i = 0; i < m; i++) {
offset = ubuffs[i] - buffs[i];
if (memcmp(buffs[i], temp_buffs[0], offset)) {
printf("Fail rand ualign encode pad start\n");
return -1;
}
if (memcmp
(buffs[i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign encode pad end\n");
return -1;
}
}
for (i = 0; i < nerrs; i++) {
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
printf("Fail rand ualign decode pad start\n");
return -1;
}
if (memcmp
(temp_buffs[k + i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign decode pad end\n");
return -1;
}
}
putchar('.');
}
// Test size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;
for (size = TEST_LEN; size > 0; size -= align) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
for (i = 0; i < k; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data_sse(size, k, m - k, g_tbls, buffs, &buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
}
printf("done EC tests: Pass\n");
return 0;
}
@ -0,0 +1,763 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 127
#endif
#ifndef RANDOMS
# define RANDOMS 200
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
#define EFENCE_TEST_MIN_SIZE 16
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
#ifndef TEST_SEED
#define TEST_SEED 11
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
// Generate Random errors
static void gen_err_list(unsigned char *src_err_list,
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
{
int i, err;
int nerrs = 0, nsrcerrs = 0;
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err) {
src_err_list[nerrs++] = i;
if (i < k) {
nsrcerrs++;
}
}
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= m) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
if (err < k)
nsrcerrs = 1;
}
*pnerrs = nerrs;
*pnsrcerrs = nsrcerrs;
return;
}
#define NO_INVERT_MATRIX -2
// Generate decode matrix from encode matrix
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
unsigned char *decode_matrix,
unsigned char *invert_matrix,
unsigned int *decode_index,
unsigned char *src_err_list,
unsigned char *src_in_err,
int nerrs, int nsrcerrs, int k, int m)
{
int i, j, p;
int r;
unsigned char *backup, *b, s;
int incr = 0;
b = malloc(MMAX * KMAX);
backup = malloc(MMAX * KMAX);
if (b == NULL || backup == NULL) {
printf("Test failure! Error with malloc\n");
free(b);
free(backup);
return -1;
}
// Construct matrix b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
for (j = 0; j < k; j++) {
b[k * i + j] = encode_matrix[k * r + j];
backup[k * i + j] = encode_matrix[k * r + j];
}
decode_index[i] = r;
}
incr = 0;
while (gf_invert_matrix(b, invert_matrix, k) < 0) {
if (nerrs == (m - k)) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
incr++;
memcpy(b, backup, MMAX * KMAX);
for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
if (src_err_list[i] == (decode_index[k - 1] + incr)) {
// skip the erased parity line
incr++;
continue;
}
}
if (decode_index[k - 1] + incr >= m) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
decode_index[k - 1] += incr;
for (j = 0; j < k; j++)
b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];
};
for (i = 0; i < nsrcerrs; i++) {
for (j = 0; j < k; j++) {
decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
}
}
/* src_err_list from encode_matrix * invert of b for parity decoding */
for (p = nsrcerrs; p < nerrs; p++) {
for (i = 0; i < k; i++) {
s = 0;
for (j = 0; j < k; j++)
s ^= gf_mul(invert_matrix[j * k + i],
encode_matrix[k * src_err_list[p] + j]);
decode_matrix[k * p + i] = s;
}
}
free(b);
free(backup);
return 0;
}
int main(int argc, char *argv[])
{
int re = 0;
int i, j, p, rtest, m, k;
int nerrs, nsrcerrs;
void *buf;
unsigned int decode_index[MMAX];
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
unsigned char *recov[TEST_SOURCES];
int rows, align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *temp_ubuffs[TEST_SOURCES];
printf("erasure_code_test: %dx%d ", TEST_SOURCES, TEST_LEN);
srand(TEST_SEED);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buffs[i] = buf;
}
// Test erasure code by encode and recovery
encode_matrix = malloc(MMAX * KMAX);
decode_matrix = malloc(MMAX * KMAX);
invert_matrix = malloc(MMAX * KMAX);
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
if (encode_matrix == NULL || decode_matrix == NULL
|| invert_matrix == NULL || g_tbls == NULL) {
printf("Test failure! Error with malloc\n");
return -1;
}
// Pick a first test
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Generate encode matrix encode_matrix
// The matrix generated by gf_gen_rs_matrix
// is not always invertible.
gf_gen_rs_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Repeat the same parameters with a Cauchy encode matrix
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
k = 16;
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
if (k > KMAX)
return -1;
for (rows = 1; rows <= 16; rows++) {
m = k + rows;
if (m > MMAX)
return -1;
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data(size, k, m - k, g_tbls, efence_buffs, &efence_buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = efence_buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 !=
memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
size)) {
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
k, nerrs);
printf("size = %d\n", size);
printf("Test erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], align);
printf("orig :");
dump(efence_buffs[src_err_list[i]], align);
return -1;
}
}
}
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < m; i++) {
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
}
for (i = 0; i < k; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = ubuffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(ubuffs, m, 25);
printf("orig :");
dump(ubuffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_ubuffs[k + i], 25);
return -1;
}
}
// Confirm that padding around dests is unchanged
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
for (i = 0; i < m; i++) {
offset = ubuffs[i] - buffs[i];
if (memcmp(buffs[i], temp_buffs[0], offset)) {
printf("Fail rand ualign encode pad start\n");
return -1;
}
if (memcmp
(buffs[i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign encode pad end\n");
return -1;
}
}
for (i = 0; i < nerrs; i++) {
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
printf("Fail rand ualign decode pad start\n");
return -1;
}
if (memcmp
(temp_buffs[k + i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign decode pad end\n");
return -1;
}
}
putchar('.');
}
// Test size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;
for (size = TEST_LEN; size > 0; size -= align) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
for (i = 0; i < k; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
ec_encode_data(size, k, m - k, g_tbls, buffs, &buffs[k]);
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
}
printf("done EC tests: Pass\n");
return 0;
}
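
The test above drives the full encode / erase / recover cycle through the library API. Below is a minimal standalone sketch of that same cycle; the fragment counts (4 data, 2 parity), the buffer length, and the fixed erasure of data fragments 0 and 2 are illustrative assumptions only, and the decode-matrix construction covers only the data-fragment-erasure case handled by the first branch of gf_gen_decode_matrix.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "erasure_code.h"

#define K 4
#define P 2
#define LEN 4096

int main(void)
{
	unsigned char *frag[K + P], *recov[K], *out[P];
	unsigned char encode_matrix[(K + P) * K], b[K * K], inv[K * K];
	unsigned char decode_matrix[P * K], g_tbls[K * P * 32];
	int i, j, r, err[2] = { 0, 2 };	// erased data fragments (fixed for the sketch)

	for (i = 0; i < K + P; i++)
		frag[i] = malloc(LEN);
	for (i = 0; i < P; i++)
		out[i] = malloc(LEN);
	for (i = 0; i < K; i++)
		for (j = 0; j < LEN; j++)
			frag[i][j] = rand();

	// Encode: the Cauchy-based matrix is invertible for any erasure pattern
	gf_gen_cauchy1_matrix(encode_matrix, K + P, K);
	ec_init_tables(K, P, &encode_matrix[K * K], g_tbls);
	ec_encode_data(LEN, K, P, g_tbls, frag, &frag[K]);

	// Build b from the encode-matrix rows of the K surviving fragments,
	// in the same order the survivors are packed into recov[]
	for (i = 0, r = 0; i < K; i++, r++) {
		while (r == err[0] || r == err[1])
			r++;
		recov[i] = frag[r];
		memcpy(&b[K * i], &encode_matrix[K * r], K);
	}
	if (gf_invert_matrix(b, inv, K) < 0)
		return 1;

	// For erased data fragments the decode rows are rows of inv(b)
	for (i = 0; i < 2; i++)
		memcpy(&decode_matrix[K * i], &inv[K * err[i]], K);
	ec_init_tables(K, 2, decode_matrix, g_tbls);
	ec_encode_data(LEN, K, 2, g_tbls, recov, out);

	for (i = 0; i < 2; i++)
		if (0 != memcmp(out[i], frag[err[i]], LEN))
			return 1;
	printf("recovered erased fragments\n");
	return 0;
}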

View File

@ -0,0 +1,306 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#include "test.h"
//By default, test multibinary version
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST ec_encode_data_update
# define REF_FUNCTION ec_encode_data
#endif
//By default, test EC(8+4)
#if (!defined(VECT))
# define VECT 4
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
# define TEST_LOOPS(m) (10000*m)
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 32
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
# define TEST_LOOPS(m) (50*m)
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS(m) 1000
# endif
# endif
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, m, k, nerrs, r;
void *buf;
u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
u8 *update_buffs[TEST_SOURCES];
u8 *perf_update_buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
struct perf start, stop;
// Pick test parameters
k = 10;
m = k + VECT;
nerrs = VECT;
const u8 err_list[] = { 0, 2, 4, 5, 7, 8 };
printf(xstr(FUNCTION_UNDER_TEST) "_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
if (m > MMAX || k > KMAX || nerrs > (m - k)) {
printf(" Input test parameter error\n");
return -1;
}
memcpy(src_err_list, err_list, nerrs);
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0; i < nerrs; i++)
src_in_err[src_err_list[i]] = 1;
// Allocate the arrays
for (i = 0; i < m; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < (m - k); i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail\n");
return -1;
}
temp_buffs[i] = buf;
memset(temp_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail");
return -1;
}
update_buffs[i] = buf;
memset(update_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
printf("alloc error: Fail");
return -1;
}
perf_update_buffs[i] = buf;
memset(perf_update_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function
}
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN(m); j++) {
buffs[i][j] = rand();
update_buffs[i][j] = buffs[i][j];
}
gf_gen_rs_matrix(a, m, k);
ec_init_tables(k, m - k, &a[k * k], g_tbls);
REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, update_buffs[i],
&update_buffs[k]);
}
for (i = 0; i < m - k; i++) {
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN(m))) {
printf("\nupdate_buffs%d :", i);
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
return -1;
}
}
#ifdef DO_REF_PERF
REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
// Start encode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Make parity vects
ec_init_tables(k, m - k, &a[k * k], g_tbls);
REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
}
perf_stop(&stop);
printf(xstr(REF_FUNCTION) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
#endif
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, perf_update_buffs[i],
&perf_update_buffs[k]);
}
// Start encode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Make parity vects
ec_init_tables(k, m - k, &a[k * k], g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls,
perf_update_buffs[i], &perf_update_buffs[k]);
}
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
// Start encode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Make parity vects
ec_init_tables(k, m - k, &a[k * k], g_tbls);
FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, 0, g_tbls, perf_update_buffs[0],
&perf_update_buffs[k]);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) "_single_src" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m - k + 1) * rtest);
// Start encode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Make parity vects
FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, 0, g_tbls, perf_update_buffs[0],
&perf_update_buffs[k]);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) "_single_src_simple" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m - k + 1) * rtest);
for (i = k; i < m; i++) {
memset(update_buffs[i], 0, TEST_LEN(m)); // initialize the destination buffer to be zero for update function
}
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, update_buffs[i],
&update_buffs[k]);
}
// Construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
recov[i] = update_buffs[r];
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix(b, d, k) < 0) {
printf("BAD MATRIX\n");
return -1;
}
for (i = 0; i < nerrs; i++)
for (j = 0; j < k; j++)
c[k * i + j] = d[k * src_err_list[i] + j];
// Recover data
ec_init_tables(k, nerrs, c, g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN(m), k, nerrs, i, g_tbls, recov[i], temp_buffs);
}
// Start decode test
perf_start(&start);
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
// Construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
recov[i] = update_buffs[r];
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix(b, d, k) < 0) {
printf("BAD MATRIX\n");
return -1;
}
for (i = 0; i < nerrs; i++)
for (j = 0; j < k; j++)
c[k * i + j] = d[k * src_err_list[i] + j];
// Recover data
ec_init_tables(k, nerrs, c, g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN(m), k, nerrs, i, g_tbls, recov[i],
perf_update_buffs);
}
}
perf_stop(&stop);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[i], update_buffs[src_err_list[i]], TEST_LEN(m))) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
return -1;
}
}
printf(xstr(FUNCTION_UNDER_TEST) "_decode" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);
printf("done all: Pass\n");
return 0;
}
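
The equivalence the check above relies on can be shown in isolation: encoding all k sources at once with ec_encode_data, and encoding them one source index at a time with ec_encode_data_update into zero-initialized parity buffers, produce the same bytes. A minimal sketch follows; k, p, and the buffer length are arbitrary choices, and gf_gen_cauchy1_matrix is used here in place of the RS matrix above.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "erasure_code.h"

#define K 8
#define P 4
#define LEN 1024

int main(void)
{
	unsigned char *data[K], *p_full[P], *p_upd[P];
	unsigned char a[(K + P) * K], g_tbls[K * P * 32];
	int i, j;

	for (i = 0; i < K; i++) {
		data[i] = malloc(LEN);
		for (j = 0; j < LEN; j++)
			data[i][j] = rand();
	}
	for (i = 0; i < P; i++) {
		p_full[i] = malloc(LEN);
		p_upd[i] = calloc(1, LEN);	// update path accumulates into zeroed parity
	}

	gf_gen_cauchy1_matrix(a, K + P, K);
	ec_init_tables(K, P, &a[K * K], g_tbls);

	ec_encode_data(LEN, K, P, g_tbls, data, p_full);	// all k sources at once
	for (i = 0; i < K; i++)	// one source index at a time
		ec_encode_data_update(LEN, K, P, i, g_tbls, data[i], p_upd);

	for (i = 0; i < P; i++)
		if (0 != memcmp(p_full[i], p_upd[i], LEN))
			return 1;
	printf("update path matches full encode\n");
	return 0;
}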

View File

@ -0,0 +1,957 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef ALIGN_SIZE
# define ALIGN_SIZE 16
#endif
//By default, test multibinary version
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST ec_encode_data_update
# define REF_FUNCTION ec_encode_data
#endif
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 127
#endif
#ifndef RANDOMS
# define RANDOMS 200
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B ALIGN_SIZE
# define LEN_ALIGN_CHK_B ALIGN_SIZE // 0 for aligned only
#endif
#ifndef TEST_SEED
#define TEST_SEED 11
#endif
#define str(s) #s
#define xstr(s) str(s)
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
// Generate Random errors
static void gen_err_list(unsigned char *src_err_list,
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
{
int i, err;
int nerrs = 0, nsrcerrs = 0;
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err) {
src_err_list[nerrs++] = i;
if (i < k) {
nsrcerrs++;
}
}
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= m) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
if (err < k)
nsrcerrs = 1;
}
*pnerrs = nerrs;
*pnsrcerrs = nsrcerrs;
return;
}
#define NO_INVERT_MATRIX -2
// Generate decode matrix from encode matrix
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
unsigned char *decode_matrix,
unsigned char *invert_matrix,
unsigned int *decode_index,
unsigned char *src_err_list,
unsigned char *src_in_err,
int nerrs, int nsrcerrs, int k, int m)
{
int i, j, p;
int r;
unsigned char *backup, *b, s;
int incr = 0;
b = malloc(MMAX * KMAX);
backup = malloc(MMAX * KMAX);
if (b == NULL || backup == NULL) {
printf("Test failure! Error with malloc\n");
free(b);
free(backup);
return -1;
}
// Construct matrix b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r])
r++;
for (j = 0; j < k; j++) {
b[k * i + j] = encode_matrix[k * r + j];
backup[k * i + j] = encode_matrix[k * r + j];
}
decode_index[i] = r;
}
incr = 0;
while (gf_invert_matrix(b, invert_matrix, k) < 0) {
if (nerrs == (m - k)) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
incr++;
memcpy(b, backup, MMAX * KMAX);
for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
if (src_err_list[i] == (decode_index[k - 1] + incr)) {
// skip the erased parity line
incr++;
continue;
}
}
if (decode_index[k - 1] + incr >= m) {
free(b);
free(backup);
printf("BAD MATRIX\n");
return NO_INVERT_MATRIX;
}
decode_index[k - 1] += incr;
for (j = 0; j < k; j++)
b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];
};
for (i = 0; i < nsrcerrs; i++) {
for (j = 0; j < k; j++) {
decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
}
}
/* src_err_list from encode_matrix * invert of b for parity decoding */
for (p = nsrcerrs; p < nerrs; p++) {
for (i = 0; i < k; i++) {
s = 0;
for (j = 0; j < k; j++)
s ^= gf_mul(invert_matrix[j * k + i],
encode_matrix[k * src_err_list[p] + j]);
decode_matrix[k * p + i] = s;
}
}
free(b);
free(backup);
return 0;
}
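/*
 * Relation implemented above: with A the m x k encode matrix and B the k x k
 * matrix formed from the rows of A for the k surviving fragments (in
 * decode_index order), survivors = B * data, so data = inv(B) * survivors.
 * Row e of inv(B) therefore recovers erased data fragment e, and an erased
 * parity fragment whose encode row is A[p] is recovered by the row product
 * A[p] * inv(B), which is the gf_mul/XOR accumulation in the final loop.
 */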
int main(int argc, char *argv[])
{
int re = 0;
int i, j, p, rtest, m, k;
int nerrs, nsrcerrs;
void *buf;
unsigned int decode_index[MMAX];
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
unsigned char *update_buffs[TEST_SOURCES];
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
unsigned char *recov[TEST_SOURCES];
int rows, align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned char *efence_update_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *update_ubuffs[TEST_SOURCES];
u8 *temp_ubuffs[TEST_SOURCES];
printf("test " xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
srand(TEST_SEED);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buffs[i] = buf;
memset(temp_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
}
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
update_buffs[i] = buf;
memset(update_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
}
// Test erasure code by encode and recovery
encode_matrix = malloc(MMAX * KMAX);
decode_matrix = malloc(MMAX * KMAX);
invert_matrix = malloc(MMAX * KMAX);
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
if (encode_matrix == NULL || decode_matrix == NULL
|| invert_matrix == NULL || g_tbls == NULL) {
printf("Test failure! Error with malloc\n");
return -1;
}
// Pick a first test
m = 15;
k = 10;
if (m > MMAX || k > KMAX)
return -1;
// Make random data
for (i = 0; i < k; i++) {
for (j = 0; j < TEST_LEN; j++) {
buffs[i][j] = rand();
update_buffs[i][j] = buffs[i][j];
}
}
// Generate encode matrix encode_matrix
// The matrix generated by gf_gen_rs_matrix
// is not always invertible.
gf_gen_rs_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
&update_buffs[k]);
}
for (i = 0; i < m - k; i++) {
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
printf("\nupdate_buffs%d :", i);
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
return -1;
}
}
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = update_buffs[decode_index[i]];
}
// Recover data
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
REF_FUNCTION(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(update_buffs[src_err_list[i]], 25);
return -1;
}
}
putchar('.');
// Pick a first test
m = 7;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
// Zero the destination buffer for update function
for (i = k; i < TEST_SOURCES; i++) {
memset(buffs[i], 0, TEST_LEN);
memset(update_buffs[i], 0, TEST_LEN);
}
// Make random data
for (i = 0; i < k; i++) {
for (j = 0; j < TEST_LEN; j++) {
buffs[i][j] = rand();
update_buffs[i][j] = buffs[i][j];
}
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Generate g_tbls from encode matrix encode_matrix
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix encode_matrix
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
&update_buffs[k]);
}
for (i = 0; i < m - k; i++) {
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
printf("\nupdate_buffs%d :", i);
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
return -1;
}
}
// Choose random buffers to be in erasure
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list, src_in_err,
nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = update_buffs[decode_index[i]];
}
// Recover data
for (i = 0; i < TEST_SOURCES; i++) {
memset(temp_buffs[i], 0, TEST_LEN);
}
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN, k, nerrs, i, g_tbls, recov[i], &temp_buffs[k]);
}
for (i = 0; i < nerrs; i++) {
if (0 != memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
printf("orig :");
dump(update_buffs[src_err_list[i]], 25);
return -1;
}
}
putchar('.');
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
// Zero the destination buffer for update function
for (i = k; i < TEST_SOURCES; i++) {
memset(buffs[i], 0, TEST_LEN);
memset(update_buffs[i], 0, TEST_LEN);
}
// Make random data
for (i = 0; i < k; i++) {
for (j = 0; j < TEST_LEN; j++) {
buffs[i][j] = rand();
update_buffs[i][j] = buffs[i][j];
}
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
&update_buffs[k]);
}
for (i = 0; i < m - k; i++) {
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
printf("\nupdate_buffs%d :", i);
dump(update_buffs[k + i], 25);
printf("buffs%d :", i);
dump(buffs[k + i], 25);
return -1;
}
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = update_buffs[decode_index[i]];
}
// Recover data
for (i = 0; i < TEST_SOURCES; i++) {
memset(temp_buffs[i], 0, TEST_LEN);
}
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(TEST_LEN, k, nerrs, i, g_tbls, recov[i],
&temp_buffs[k]);
}
for (i = 0; i < nerrs; i++) {
if (0 !=
memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]],
TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(update_buffs, m, 25);
printf("orig :");
dump(update_buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
k = 16;
align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
if (k > KMAX)
return -1;
for (rows = 1; rows <= 16; rows++) {
m = k + rows;
if (m > MMAX)
return -1;
for (i = k; i < TEST_SOURCES; i++) {
memset(buffs[i], 0, TEST_LEN);
memset(update_buffs[i], 0, TEST_LEN);
}
// Make random data
for (i = 0; i < k; i++) {
for (j = 0; j < TEST_LEN; j++) {
buffs[i][j] = rand();
update_buffs[i][j] = buffs[i][j];
}
}
for (size = 0; size <= TEST_SIZE; size += align) {
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
efence_update_buffs[i] = update_buffs[i] + TEST_LEN - size;
}
// Zero the destination buffer for update function
for (i = k; i < m; i++) {
memset(efence_buffs[i], 0, size);
memset(efence_update_buffs[i], 0, size);
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
REF_FUNCTION(size, k, m - k, g_tbls, efence_buffs, &efence_buffs[k]);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls,
efence_update_buffs[i],
&efence_update_buffs[k]);
}
for (i = 0; i < m - k; i++) {
if (0 !=
memcmp(efence_update_buffs[k + i], efence_buffs[k + i],
size)) {
printf("\nefence_update_buffs%d :", i);
dump(efence_update_buffs[k + i], 25);
printf("efence_buffs%d :", i);
dump(efence_buffs[k + i], 25);
return -1;
}
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = efence_update_buffs[decode_index[i]];
}
// Recover data
for (i = 0; i < TEST_SOURCES; i++) {
memset(temp_buffs[i], 0, TEST_LEN);
}
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
&temp_buffs[k]);
}
for (i = 0; i < nerrs; i++) {
if (0 !=
memcmp(temp_buffs[k + i],
efence_update_buffs[src_err_list[i]], size)) {
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
k, nerrs);
printf("size = %d\n", size);
printf("Test erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((u8 *) encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((u8 *) decode_matrix, m, k);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], align);
printf("orig :");
dump(efence_update_buffs[src_err_list[i]], align);
return -1;
}
}
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < m; i++) {
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
memset(update_buffs[i], 0, TEST_LEN); // zero pad to check write-over
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
update_ubuffs[i] =
update_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
}
// Zero the destination buffer for update function
for (i = k; i < m; i++) {
memset(ubuffs[i], 0, size);
memset(update_ubuffs[i], 0, size);
}
// Make random data
for (i = 0; i < k; i++) {
for (j = 0; j < size; j++) {
ubuffs[i][j] = rand();
update_ubuffs[i][j] = ubuffs[i][j];
}
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
REF_FUNCTION(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls, update_ubuffs[i],
&update_ubuffs[k]);
}
for (i = 0; i < m - k; i++) {
if (0 != memcmp(update_ubuffs[k + i], ubuffs[k + i], size)) {
printf("\nupdate_ubuffs%d :", i);
dump(update_ubuffs[k + i], 25);
printf("ubuffs%d :", i);
dump(ubuffs[k + i], 25);
return -1;
}
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = update_ubuffs[decode_index[i]];
}
// Recover data
for (i = 0; i < m; i++) {
memset(temp_ubuffs[i], 0, size);
}
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
&temp_ubuffs[k]);
}
for (i = 0; i < nerrs; i++) {
if (0 !=
memcmp(temp_ubuffs[k + i], update_ubuffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(update_ubuffs, m, 25);
printf("orig :");
dump(update_ubuffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_ubuffs[k + i], 25);
return -1;
}
}
// Confirm that padding around dests is unchanged
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
for (i = 0; i < m; i++) {
offset = update_ubuffs[i] - update_buffs[i];
if (memcmp(update_buffs[i], temp_buffs[0], offset)) {
printf("Fail rand ualign encode pad start\n");
return -1;
}
if (memcmp
(update_buffs[i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign encode pad end\n");
return -1;
}
}
for (i = 0; i < nerrs; i++) {
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
printf("Fail rand ualign decode pad start\n");
return -1;
}
if (memcmp
(temp_buffs[k + i] + offset + size, temp_buffs[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign decode pad end\n");
return -1;
}
}
putchar('.');
}
// Test size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 13 : ALIGN_SIZE;
for (size = TEST_LEN; size >= 0; size -= align) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
// Zero the destination buffer for update function
for (i = k; i < TEST_SOURCES; i++) {
memset(buffs[i], 0, size);
memset(update_buffs[i], 0, size);
}
// Make random data
for (i = 0; i < k; i++) {
for (j = 0; j < size; j++) {
buffs[i][j] = rand();
update_buffs[i][j] = buffs[i][j];
}
}
// The matrix generated by gf_gen_cauchy1_matrix
// is always invertible.
gf_gen_cauchy1_matrix(encode_matrix, m, k);
// Make parity vects
// Generate g_tbls from encode matrix a
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
// Perform matrix dot_prod for EC encoding
// using g_tbls from encode matrix a
REF_FUNCTION(size, k, m - k, g_tbls, buffs, &buffs[k]);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls, update_buffs[i],
&update_buffs[k]);
}
for (i = 0; i < m - k; i++) {
if (0 != memcmp(update_buffs[k + i], buffs[k + i], size)) {
printf("\nupdate_buffs%d (size=%d) :", i, size);
dump(update_buffs[k + i], 25);
printf("buffs%d (size=%d) :", i, size);
dump(buffs[k + i], 25);
return -1;
}
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
// Generate decode matrix
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
invert_matrix, decode_index, src_err_list,
src_in_err, nerrs, nsrcerrs, k, m);
if (re != 0) {
printf("Fail to gf_gen_decode_matrix\n");
return -1;
}
// Pack recovery array as list of valid sources
// Its order must be the same as the order
// to generate matrix b in gf_gen_decode_matrix
for (i = 0; i < k; i++) {
recov[i] = update_buffs[decode_index[i]];
}
// Recover data
for (i = 0; i < TEST_SOURCES; i++) {
memset(temp_buffs[i], 0, TEST_LEN);
}
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
for (i = 0; i < k; i++) {
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
&temp_buffs[k]);
}
for (i = 0; i < nerrs; i++) {
if (0 !=
memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], size)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (j = 0; j < nerrs; j++)
printf(" %d", src_err_list[j]);
printf(" - Index = ");
for (p = 0; p < k; p++)
printf(" %d", decode_index[p]);
printf("\nencode_matrix:\n");
dump_u8xu8((unsigned char *)encode_matrix, m, k);
printf("inv b:\n");
dump_u8xu8((unsigned char *)invert_matrix, k, k);
printf("\ndecode_matrix:\n");
dump_u8xu8((unsigned char *)decode_matrix, m, k);
printf("orig data:\n");
dump_matrix(update_buffs, m, 25);
printf("orig :");
dump(update_buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buffs[k + i], 25);
return -1;
}
}
putchar('.');
}
printf("done EC tests: Pass\n");
return 0;
}
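
Because ec_encode_data_update only accumulates (XORs in) the contribution of the single source index it is given, as the equivalence check above relies on, a rewritten fragment can be folded into existing parity without touching the other k-1 fragments: add the old contents once to cancel the old contribution, then add the new contents. The sketch below cross-checks that pattern against a full re-encode; the fragment counts, sizes, and the choice of fragment 3 are arbitrary assumptions.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "erasure_code.h"

#define K 6
#define P 2
#define LEN 512

int main(void)
{
	unsigned char *data[K], *parity[P], *ref[P], *old_copy;
	unsigned char a[(K + P) * K], g_tbls[K * P * 32];
	int i, j, chg = 3;	// fragment being rewritten (arbitrary choice)

	for (i = 0; i < K; i++) {
		data[i] = malloc(LEN);
		for (j = 0; j < LEN; j++)
			data[i][j] = rand();
	}
	for (i = 0; i < P; i++) {
		parity[i] = malloc(LEN);
		ref[i] = malloc(LEN);
	}
	old_copy = malloc(LEN);

	gf_gen_cauchy1_matrix(a, K + P, K);
	ec_init_tables(K, P, &a[K * K], g_tbls);
	ec_encode_data(LEN, K, P, g_tbls, data, parity);	// initial parity

	// Rewrite one fragment: cancel its old contribution, then add the new one
	memcpy(old_copy, data[chg], LEN);
	for (j = 0; j < LEN; j++)
		data[chg][j] = rand();
	ec_encode_data_update(LEN, K, P, chg, g_tbls, old_copy, parity);
	ec_encode_data_update(LEN, K, P, chg, g_tbls, data[chg], parity);

	// Cross-check against a full re-encode of the updated data
	ec_encode_data(LEN, K, P, g_tbls, data, ref);
	for (i = 0; i < P; i++)
		if (0 != memcmp(parity[i], ref[i], LEN))
			return 1;
	printf("incremental parity update matches full re-encode\n");
	return 0;
}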

View File

@ -0,0 +1,337 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r9
%define tmp4 r12 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
%endmacro
%macro FUNC_RESTORE 0
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_reg r12, 3*16 + 0*8
save_reg r13, 3*16 + 1*8
save_reg r14, 3*16 + 2*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
mov r12, [rsp + 3*16 + 0*8]
mov r13, [rsp + 3*16 + 1*8]
mov r14, [rsp + 3*16 + 2*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are for the variables in stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define tmp edx
%define tmp2 edi
%define tmp3 trans2
%define tmp4 trans2
%define tmp4_m var(0)
%define return eax
%macro SLDR 2 ;; stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*1 ;1 local variable
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*1 ;1 local variable
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define vec_i tmp2
%define ptr tmp3
%define dest2 tmp4
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ; 64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f xmm8
%define xgft1_lo xmm7
%define xgft1_hi xmm6
%define xgft2_lo xmm5
%define xgft2_hi xmm4
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%else ;32-bit code
%define xmask0f xmm4
%define xgft1_lo xmm7
%define xgft1_hi xmm6
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%endif
align 16
global gf_2vect_dot_prod_avx:function
func(gf_2vect_dot_prod_avx)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop16
vpxor xp1, xp1
vpxor xp2, xp2
mov tmp, mul_array
xor vec_i, vec_i
.next_vect
SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8 ; 64-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
add tmp, 32
add vec_i, PS
%endif
XLDR x0, [ptr+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ; 32-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
add tmp, 32
add vec_i, PS
%endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_2vect_dot_prod_avx, 02, 05, 0191
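
The inner loop above reduces each GF(2^8) multiply to two vpshufb lookups and an XOR: each matrix coefficient occupies a 32-byte g_tbls segment holding its products with the sixteen low-nibble values and the sixteen high-nibble values, the layout implied by the [tmp] and [tmp+16] loads and their comments. A scalar sketch of the same identity, checked against gf_mul from the library header (the coefficient value is an arbitrary example):

#include <stdio.h>
#include "erasure_code.h"

typedef unsigned char u8;

int main(void)
{
	u8 c = 0x57;		// example GF(2^8) coefficient (arbitrary)
	u8 tbl_lo[16], tbl_hi[16];
	int x;

	// 32 bytes per coefficient: products with the 16 low-nibble values
	// ([tmp] above) and the 16 high-nibble values ([tmp+16] above)
	for (x = 0; x < 16; x++) {
		tbl_lo[x] = gf_mul(c, x);	// c * 0x00 .. c * 0x0f
		tbl_hi[x] = gf_mul(c, x << 4);	// c * 0x00, c * 0x10 .. c * 0xf0
	}
	// One multiply = two table lookups and an XOR, as in the vpshufb kernel:
	// c * x = c * (x & 0x0f) ^ c * (x & 0xf0)
	for (x = 0; x < 256; x++)
		if ((tbl_lo[x & 0x0f] ^ tbl_hi[x >> 4]) != gf_mul(c, x))
			return 1;
	printf("nibble-table lookup matches gf_mul for every byte\n");
	return 0;
}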

View File

@ -0,0 +1,356 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r9
%define tmp4 r12 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
%endmacro
%macro FUNC_RESTORE 0
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
vmovdqa [rsp + 0*16], xmm6
vmovdqa [rsp + 1*16], xmm7
vmovdqa [rsp + 2*16], xmm8
save_reg r12, 3*16 + 0*8
save_reg r13, 3*16 + 1*8
save_reg r14, 3*16 + 2*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
mov r12, [rsp + 3*16 + 0*8]
mov r13, [rsp + 3*16 + 1*8]
mov r14, [rsp + 3*16 + 2*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are for the variables in stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define tmp edx
%define tmp.w edx
%define tmp.b dl
%define tmp2 edi
%define tmp3 trans2
%define tmp4 trans2
%define tmp4_m var(0)
%define return eax
%macro SLDR 2 ;stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*1 ;1 local variable
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*1 ;1 local variable
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define vec_i tmp2
%define ptr tmp3
%define dest2 tmp4
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ;64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f ymm8
%define xmask0fx xmm8
%define xgft1_lo ymm7
%define xgft1_hi ymm6
%define xgft2_lo ymm5
%define xgft2_hi ymm4
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%else ;32-bit code
%define xmask0f ymm7
%define xmask0fx xmm7
%define xgft1_lo ymm5
%define xgft1_hi ymm4
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%endif
align 16
global gf_2vect_dot_prod_avx2:function
func(gf_2vect_dot_prod_avx2)
FUNC_SAVE
SLDR len, len_m
sub len, 32
SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop32:
vpxor xp1, xp1
vpxor xp2, xp2
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
%ifidn PS,8 ; 64-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, PS
%else
XLDR x0, [ptr+pos] ;Get next source vector
%endif
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ; 32-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
add tmp, 32
add vec_i, PS
%endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR len, len_m
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
jmp .loop32 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_2vect_dot_prod_avx2, 04, 05, 0196

View File

@ -0,0 +1,339 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
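;;; SSE variant of the two-output GF(2^8) dot product. Same semantics and g_tbls
;;; layout as the AVX2 version above, but it processes 16 bytes per iteration
;;; (len must be at least 16) and loads the 0x0f nibble mask from the .data
;;; section instead of building it in a register.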
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r9
%define tmp4 r12 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
%endmacro
%macro FUNC_RESTORE 0
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_reg r12, 3*16 + 0*8
save_reg r13, 3*16 + 1*8
save_reg r14, 3*16 + 2*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp + 0*16]
movdqa xmm7, [rsp + 1*16]
movdqa xmm8, [rsp + 2*16]
mov r12, [rsp + 3*16 + 0*8]
mov r13, [rsp + 3*16 + 1*8]
mov r14, [rsp + 3*16 + 2*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are used for arguments/variables kept on the stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define tmp edx
%define tmp2 edi
%define tmp3 trans2
%define tmp4 trans2
%define tmp4_m var(0)
%define return eax
%macro SLDR 2 ;; stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*1 ;1 local variable
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*1 ;1 local variable
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define vec_i tmp2
%define ptr tmp3
%define dest2 tmp4
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
%ifidn PS,8 ;64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f xmm8
%define xgft1_lo xmm7
%define xgft1_hi xmm6
%define xgft2_lo xmm5
%define xgft2_hi xmm4
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%else ;32-bit code
%define xmask0f xmm4
%define xgft1_lo xmm7
%define xgft1_hi xmm6
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%endif
align 16
global gf_2vect_dot_prod_sse:function
func(gf_2vect_dot_prod_sse)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop16:
pxor xp1, xp1
pxor xp2, xp2
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8 ;64-bit code
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
add tmp, 32
add vec_i, PS
%endif
XLDR x0, [ptr+pos] ;Get next source vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ;32-bit code
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
add tmp, 32
add vec_i, PS
%endif
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp2, xgft2_hi ;xp2 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_2vect_dot_prod_sse, 00, 04, 0062

View File

@ -0,0 +1,216 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32];
u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2];
u8 *buffs[TEST_SOURCES];
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
}
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
}
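// g_tbls now holds one 32-byte nibble lookup table per (dest, source) pair:
// the first 32*TEST_SOURCES bytes are the tables for dest1, followed by the
// tables for dest2.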
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS / 100; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
}
perf_stop(&stop);
printf("gf_2vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 2) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
}
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 2) * i);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

View File

@ -0,0 +1,477 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 16
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 10000
#define TEST_TYPE_STR ""
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
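// With EC_ALIGNED_ADDR defined only aligned pointers and 16-byte multiple
// lengths are exercised; otherwise the tests below add random pointer offsets
// of up to PTR_ALIGN_CHK_B-1 bytes and step the length by single bytes.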
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32];
u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2];
u8 *buffs[TEST_SOURCES];
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptrs[2];
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(g1, 2, TEST_SOURCES);
memset(g2, 1, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
dest_ref2);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test1 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test2 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
putchar('.');
}
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
efence_buffs, dest_ref2);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, align);
printf("dprod_dut:");
dump(dest1, align);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, align);
printf("dprod_dut:");
dump(dest2, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
memset(dest2, 0, TEST_LEN);
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(udest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(udest_ptrs[1], 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptrs[0] - dest1;
if (memcmp(dest1, dest_ref1, offset)) {
printf("Fail rand ualign pad1 start\n");
return -1;
}
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad1 end\n");
return -1;
}
offset = udest_ptrs[1] - dest2;
if (memcmp(dest2, dest_ref1, offset)) {
printf("Fail rand ualign pad2 start\n");
return -1;
}
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad2 end\n");
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest_ptrs[1], 25);
return -1;
}
}
printf("Pass\n");
return 0;
}

View File

@ -0,0 +1,236 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
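;;; Multiply-and-add of a single source buffer into two destinations:
;;; dest[d][i] ^= gf_mul(c[d], src[i]). The two 32-byte lookup tables are read
;;; from mul_array at byte offsets vec_i*32 and (vec_i+vec)*32, and the dest
;;; argument is an array holding the two destination pointers.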
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp2 r10
%define return rax
%define return.w eax
%define stack_size 16*9 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
save_reg r12, 9*16 + 0*8
save_reg r15, 9*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
mov r12, [rsp + 9*16 + 0*8]
mov r15, [rsp + 9*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp2
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm14
%define xgft1_lo xmm13
%define xgft1_hi xmm12
%define xgft2_lo xmm11
%define xgft2_hi xmm10
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xd1 xmm6
%define xd2 xmm7
%define xtmpd1 xmm8
%define xtmpd2 xmm9
align 16
global gf_2vect_mad_avx:function
func(gf_2vect_mad_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec_i, 5 ;Multiply by 32
sal vec, 5
lea tmp, [mul_array + vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
mov dest2, [dest1+PS]
mov dest1, [dest1]
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
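;; The destinations are read-modify-write, so the original last 16 bytes of
;; each dest are saved above; the tail pass restores them before its final
;; overlapped iteration so the overlapping bytes are not accumulated twice.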
.loop16:
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
.loop16_overlap:
XLDR x0, [src+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xd1, xtmph1 ;xd1 += partial
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xd2, xtmph2 ;xd2 += partial
XSTR [dest1+pos], xd1
XSTR [dest2+pos], xd2
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
vmovdqa xd1, xtmpd1 ;Restore xd1
vmovdqa xd2, xtmpd2 ;Restore xd2
jmp .loop16_overlap ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_2vect_mad_avx, 02, 01, 0204

View File

@ -0,0 +1,247 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
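;;; AVX2 variant of the two-destination multiply-and-add. Same semantics and
;;; table layout as the AVX version, but it works on 32 bytes per iteration
;;; (len must be at least 32) and builds the 0x0f nibble mask in a register
;;; with vpinsrb/vpbroadcastb instead of loading it from .data.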
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define return rax
%define return.w eax
%define stack_size 16*9 + 3*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
vmovdqa [rsp+16*0],xmm6
vmovdqa [rsp+16*1],xmm7
vmovdqa [rsp+16*2],xmm8
vmovdqa [rsp+16*3],xmm9
vmovdqa [rsp+16*4],xmm10
vmovdqa [rsp+16*5],xmm11
vmovdqa [rsp+16*6],xmm12
vmovdqa [rsp+16*7],xmm13
vmovdqa [rsp+16*8],xmm14
save_reg r12, 9*16 + 0*8
save_reg r15, 9*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp+16*0]
vmovdqa xmm7, [rsp+16*1]
vmovdqa xmm8, [rsp+16*2]
vmovdqa xmm9, [rsp+16*3]
vmovdqa xmm10, [rsp+16*4]
vmovdqa xmm11, [rsp+16*5]
vmovdqa xmm12, [rsp+16*6]
vmovdqa xmm13, [rsp+16*7]
vmovdqa xmm14, [rsp+16*8]
mov r12, [rsp + 9*16 + 0*8]
mov r15, [rsp + 9*16 + 1*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp2
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm14
%define xmask0fx xmm14
%define xgft1_lo ymm13
%define xgft1_hi ymm12
%define xgft2_lo ymm11
%define xgft2_hi ymm10
%define x0 ymm0
%define xtmpa ymm1
%define xtmph1 ymm2
%define xtmpl1 ymm3
%define xtmph2 ymm4
%define xtmpl2 ymm5
%define xd1 ymm6
%define xd2 ymm7
%define xtmpd1 ymm8
%define xtmpd2 ymm9
align 16
global gf_2vect_mad_avx2:function
func(gf_2vect_mad_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec_i, 5 ;Multiply by 32
sal vec, 5
lea tmp, [mul_array + vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
mov dest2, [dest1+PS] ;Load second dest pointer
mov dest1, [dest1]
XLDR xtmpd1, [dest1+len] ;backup the last 32 bytes in dest
XLDR xtmpd2, [dest2+len] ;backup the last 32 bytes in dest
.loop32:
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
.loop32_overlap:
XLDR x0, [src+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xd1, xtmph1 ;xd1 += partial
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xd2, xtmph2 ;xd2 += partial
XSTR [dest1+pos], xd1
XSTR [dest2+pos], xd2
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
vmovdqa xd1, xtmpd1 ;Restore xd1
vmovdqa xd2, xtmpd2 ;Restore xd2
jmp .loop32_overlap ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_2vect_mad_avx2, 04, 01, 0205

View File

@ -0,0 +1,239 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
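;;; SSE variant of the two-destination multiply-and-add (16 bytes per
;;; iteration). pshufb overwrites its destination register, so the constant
;;; lookup tables are copied into temporaries at the top of each iteration
;;; before the nibble lookups.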
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp2 r10
%define return rax
%define return.w eax
%define stack_size 16*9 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
save_reg r12, 9*16 + 0*8
save_reg r15, 9*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
mov r12, [rsp + 9*16 + 0*8]
mov r15, [rsp + 9*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp2
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm14
%define xgft1_lo xmm13
%define xgft1_hi xmm12
%define xgft2_lo xmm11
%define xgft2_hi xmm10
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xd1 xmm6
%define xd2 xmm7
%define xtmpd1 xmm8
%define xtmpd2 xmm9
align 16
global gf_2vect_mad_sse:function
func(gf_2vect_mad_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec_i, 5 ;Multiply by 32
sal vec, 5
lea tmp, [mul_array + vec_i]
movdqu xgft1_lo,[tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
mov dest2, [dest1+PS]
mov dest1, [dest1]
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
.loop16:
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
.loop16_overlap:
XLDR x0, [src+pos] ;Get next source vector
movdqa xtmph1, xgft1_hi ;Reload const array registers
movdqa xtmpl1, xgft1_lo
movdqa xtmph2, xgft2_hi ;Reload const array registers
movdqa xtmpl2, xgft2_lo
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pxor xd1, xtmph1
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pxor xd2, xtmph2
XSTR [dest1+pos], xd1 ;Store result
XSTR [dest2+pos], xd2 ;Store result
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
movdqa xd1, xtmpd1 ;Restore xd1
movdqa xd2, xtmpd2 ;Restore xd2
jmp .loop16_overlap ;Do one more overlap pass
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_2vect_mad_sse, 00, 01, 0203

View File

@ -0,0 +1,377 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
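;;; Three-output version of the GF(2^8) dot product (AVX, 16 bytes per
;;; iteration); the lookup-table groups for the three destinations are spaced
;;; vec*32 bytes apart in g_tbls. In 32-bit builds only xmm0-xmm7 are
;;; available, so the second and third table registers alias the first set and
;;; are reloaded inside the source loop.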
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
%endmacro
%macro FUNC_RESTORE 0
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_reg r12, 6*16 + 0*8
save_reg r13, 6*16 + 1*8
save_reg r14, 6*16 + 2*8
save_reg r15, 6*16 + 3*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
mov r12, [rsp + 6*16 + 0*8]
mov r13, [rsp + 6*16 + 1*8]
mov r14, [rsp + 6*16 + 2*8]
mov r15, [rsp + 6*16 + 3*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; var1
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are used for arguments/variables kept on the stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define arg5 trans2
%define tmp edx
%define tmp2 edi
%define tmp3 trans2
%define tmp3_m var(0)
%define tmp4 trans2
%define tmp4_m var(1)
%define return eax
%macro SLDR 2 ;; stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*2 ;2 local variables
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*2 ;2 local variables
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define ptr arg5
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp3_m
%define dest3_m tmp4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ; 64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f xmm11
%define xgft1_lo xmm10
%define xgft1_hi xmm9
%define xgft2_lo xmm8
%define xgft2_hi xmm7
%define xgft3_lo xmm6
%define xgft3_hi xmm5
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%else
%define xmask0f xmm7
%define xgft1_lo xmm6
%define xgft1_hi xmm5
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define xgft3_lo xgft1_lo
%define xgft3_hi xgft1_hi
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%endif
align 16
global gf_3vect_dot_prod_avx:function
func(gf_3vect_dot_prod_avx)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
SSTR dest3_m, dest3
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop16:
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8 ; 64-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
add tmp, 32
add vec_i, PS
%endif
XLDR x0, [ptr+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ; 32-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
%endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
%ifidn PS,4 ; 32-bit code
sal vec, 1
vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
sar vec, 1
add tmp, 32
add vec_i, PS
%endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_3vect_dot_prod_avx, 02, 05, 0192

View File

@ -0,0 +1,397 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
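;;; Rough C-level sketch of the computation (an assumption, inferred from the
;;; reference gf_vect_dot_prod_base used by the unit tests, not a definitive
;;; restatement of this routine):
;;;   for (k = 0; k < 3; k++)
;;;     for (i = 0; i < len; i++) {
;;;       dests[k][i] = 0;
;;;       for (j = 0; j < vec; j++)
;;;         dests[k][i] ^= gf_mul_by_table(&g_tbls[k*32*vec + j*32], buffs[j][i]);
;;;     }
;;; where gf_mul_by_table is a hypothetical shorthand for a lookup through the
;;; 32-byte table produced by gf_vect_mul_init().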
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
%endmacro
%macro FUNC_RESTORE 0
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
vmovdqa [rsp + 0*16], xmm6
vmovdqa [rsp + 1*16], xmm7
vmovdqa [rsp + 2*16], xmm8
vmovdqa [rsp + 3*16], xmm9
vmovdqa [rsp + 4*16], xmm10
vmovdqa [rsp + 5*16], xmm11
save_reg r12, 6*16 + 0*8
save_reg r13, 6*16 + 1*8
save_reg r14, 6*16 + 2*8
save_reg r15, 6*16 + 3*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
mov r12, [rsp + 6*16 + 0*8]
mov r13, [rsp + 6*16 + 1*8]
mov r14, [rsp + 6*16 + 2*8]
mov r15, [rsp + 6*16 + 3*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; var1
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are for the variables in stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define arg5 trans2
%define tmp edx
%define tmp.w edx
%define tmp.b dl
%define tmp2 edi
%define tmp3 trans2
%define tmp3_m var(0)
%define tmp4 trans2
%define tmp4_m var(1)
%define return eax
%macro SLDR 2 ;stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*2 ;2 local variables
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*2 ;2 local variables
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define ptr arg5
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp3_m
%define dest3_m tmp4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ;64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f ymm11
%define xmask0fx xmm11
%define xgft1_lo ymm10
%define xgft1_hi ymm9
%define xgft2_lo ymm8
%define xgft2_hi ymm7
%define xgft3_lo ymm6
%define xgft3_hi ymm5
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%define xp3 ymm4
%else
%define xmask0f ymm7
%define xmask0fx xmm7
%define xgft1_lo ymm6
%define xgft1_hi ymm5
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define xgft3_lo xgft1_lo
%define xgft3_hi xgft1_hi
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%define xp3 ymm4
%endif
align 16
global gf_3vect_dot_prod_avx2:function
func(gf_3vect_dot_prod_avx2)
FUNC_SAVE
SLDR len, len_m
sub len, 32
SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
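;; Builds the 0x0f byte mask in a register (insert + broadcast) instead of
;; loading it from a .data constant as the SSE/AVX versions do.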
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
SSTR dest3_m, dest3
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop32:
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
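;; AVX2 vpshufb shuffles within each 128-bit lane independently, so the
;; 16-byte table is duplicated into both lanes (hi | hi and lo | lo) before
;; it can serve as a lookup table for the full 32-byte vector.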
%ifidn PS,8 ; 64-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
add tmp, 32
add vec_i, PS
%endif
XLDR x0, [ptr+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ; 32-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
%endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
%ifidn PS,4 ; 32-bit code
sal vec, 1
vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
sar vec, 1
add tmp, 32
add vec_i, PS
%endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
SLDR len, len_m
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
jmp .loop32 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_3vect_dot_prod_avx2, 04, 05, 0197

View File

@@ -0,0 +1,378 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
%endmacro
%macro FUNC_RESTORE 0
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_reg r12, 6*16 + 0*8
save_reg r13, 6*16 + 1*8
save_reg r14, 6*16 + 2*8
save_reg r15, 6*16 + 3*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp + 0*16]
movdqa xmm7, [rsp + 1*16]
movdqa xmm8, [rsp + 2*16]
movdqa xmm9, [rsp + 3*16]
movdqa xmm10, [rsp + 4*16]
movdqa xmm11, [rsp + 5*16]
mov r12, [rsp + 6*16 + 0*8]
mov r13, [rsp + 6*16 + 1*8]
mov r14, [rsp + 6*16 + 2*8]
mov r15, [rsp + 6*16 + 3*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; var1
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are for the variables in stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define arg5 trans2
%define tmp edx
%define tmp2 edi
%define tmp3 trans2
%define tmp3_m var(0)
%define tmp4 trans2
%define tmp4_m var(1)
%define return eax
%macro SLDR 2 ;; stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*2 ;2 local variables
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*2 ;2 local variables
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define ptr arg5
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp3_m
%define dest3_m tmp4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
%ifidn PS,8 ; 64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f xmm11
%define xgft1_lo xmm2
%define xgft1_hi xmm3
%define xgft2_lo xmm4
%define xgft2_hi xmm7
%define xgft3_lo xmm6
%define xgft3_hi xmm5
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm10
%define xp2 xmm9
%define xp3 xmm8
%else
%define xmask0f xmm7
%define xgft1_lo xmm6
%define xgft1_hi xmm5
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define xgft3_lo xgft1_lo
%define xgft3_hi xgft1_hi
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%endif
align 16
global gf_3vect_dot_prod_sse:function
func(gf_3vect_dot_prod_sse)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
SSTR dest3_m, dest3
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop16:
pxor xp1, xp1
pxor xp2, xp2
pxor xp3, xp3
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8 ;64-bit code
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
movdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
add tmp, 32
add vec_i, PS
%endif
XLDR x0, [ptr+pos] ;Get next source vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ;32-bit code
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
%endif
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp2, xgft2_hi ;xp2 += partial
%ifidn PS,4 ;32-bit code
sal vec, 1
movdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
sar vec, 1
add tmp, 32
add vec_i, PS
%endif
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
pxor xp3, xgft3_hi ;xp3 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_3vect_dot_prod_sse, 00, 06, 0063

View File

@@ -0,0 +1,246 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse
#endif
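/* The default above can be overridden at compile time, e.g. with something
 * like -DFUNCTION_UNDER_TEST=gf_3vect_dot_prod_avx2 (a hypothetical
 * invocation; the build system's actual flags may differ), so the same
 * source serves as a perf harness for each SIMD variant. */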
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
}
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
}
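/* g_tbls layout used above (three outputs, TEST_SOURCES inputs):
 * gf_vect_mul_init() expands each byte coefficient into a 32-byte
 * nibble-lookup table, so the table for output k and source j starts at
 * g_tbls[k * 32 * TEST_SOURCES + j * 32]. */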
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS / 100; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
}
perf_stop(&stop);
printf("gf_3vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
}
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * i);
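/* Bytes per iteration = TEST_SOURCES source buffers read plus 3 destination
 * buffers written, each TEST_LEN long; i is the completed loop count, and
 * perf_print() is assumed to turn this byte total into a throughput figure. */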
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

View File

@@ -0,0 +1,583 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 16
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 10000
#define TEST_TYPE_STR ""
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptrs[3];
printf(xstr(FUNCTION_UNDER_TEST) "_test: %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");;
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(dest_ref3, 0, TEST_LEN);
memset(g1, 2, TEST_SOURCES);
memset(g2, 1, TEST_SOURCES);
memset(g3, 7, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail zero" xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
dest_ref3);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test1 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test2 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test3 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
putchar('.');
}
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
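/* Pointing each source at the last 'size' bytes of its allocation means any
 * read past the requested length falls outside the buffer, which a bounds
 * checker such as Electric Fence can trap. */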
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
efence_buffs, dest_ref2);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
efence_buffs, dest_ref3);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, align);
printf("dprod_dut:");
dump(dest1, align);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, align);
printf("dprod_dut:");
dump(dest2, align);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, align);
printf("dprod_dut:");
dump(dest3, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(udest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(udest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(udest_ptrs[2], 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptrs[0] - dest1;
if (memcmp(dest1, dest_ref1, offset)) {
printf("Fail rand ualign pad1 start\n");
return -1;
}
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad1 end\n");
return -1;
}
offset = udest_ptrs[1] - dest2;
if (memcmp(dest2, dest_ref1, offset)) {
printf("Fail rand ualign pad2 start\n");
return -1;
}
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad2 end\n");
return -1;
}
offset = udest_ptrs[2] - dest3;
if (memcmp(dest3, dest_ref1, offset)) {
printf("Fail rand ualign pad3 start\n");
return -1;
}
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad3 end\n");;
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest_ptrs[2], 25);
return -1;
}
}
printf("Pass\n");
return 0;
}

View File

@@ -0,0 +1,288 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
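;;; Rough semantics (a sketch inferred from the loop below, mirroring the
;;; dot-product routines): for the single source buffer src at source index
;;; vec_i, each of the three outputs is updated in place as
;;;   dest[k][i] ^= gf_mul_by_table(&mul_array[(k*vec + vec_i)*32], src[i])
;;; (gf_mul_by_table being a hypothetical shorthand for the 32-byte table
;;; lookup), i.e. a GF(2^8) multiply-accumulate into existing parity rather
;;; than a full dot product over all sources.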
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
vmovdqa [rsp+16*0],xmm6
vmovdqa [rsp+16*1],xmm7
vmovdqa [rsp+16*2],xmm8
vmovdqa [rsp+16*3],xmm9
vmovdqa [rsp+16*4],xmm10
vmovdqa [rsp+16*5],xmm11
vmovdqa [rsp+16*6],xmm12
vmovdqa [rsp+16*7],xmm13
vmovdqa [rsp+16*8],xmm14
vmovdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r15, 10*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp+16*0]
vmovdqa xmm7, [rsp+16*1]
vmovdqa xmm8, [rsp+16*2]
vmovdqa xmm9, [rsp+16*3]
vmovdqa xmm10, [rsp+16*4]
vmovdqa xmm11, [rsp+16*5]
vmovdqa xmm12, [rsp+16*6]
vmovdqa xmm13, [rsp+16*7]
vmovdqa xmm14, [rsp+16*8]
vmovdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r15, [rsp + 10*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 mul_array
%define dest3 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xd1 xmm8
%define xd2 xtmpl1
%define xd3 xtmph1
align 16
global gf_3vect_mad_avx:function
func(gf_3vect_mad_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec_i, 5 ;Multiply by 32
sal vec, 5
lea tmp, [mul_array + vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
vmovdqu xgft3_hi, [tmp+2*vec+16]; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
mov dest2, [dest1+PS] ; reuse mul_array
mov dest3, [dest1+2*PS] ; reuse vec_i
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
XLDR xd1, [dest1+pos] ;Get next dest vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xd1, xtmph1 ;xd1 += partial
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
; dest2
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xd2, xtmph2 ;xd2 += partial
; dest3
vpshufb xtmph3, xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
vpxor xd3, xd3, xtmph3 ;xd3 += partial
XSTR [dest1+pos], xd1
XSTR [dest2+pos], xd2
XSTR [dest3+pos], xd3
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
mov tmp, len ;Overlapped offset length-16
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest vector
sub len, pos
vmovdqa xtmph3, [constip16] ;Load const of i + 16
vpinsrb xtmpl3, xtmpl3, len.w, 15
vpshufb xtmpl3, xtmpl3, xmask0f ;Broadcast len to all bytes
vpcmpgtb xtmpl3, xtmpl3, xtmph3
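;; xtmpl3 is now 0xff in byte lanes that still need the tail update and 0x00
;; in lanes already covered by the last full 16-byte iteration; masking each
;; partial with it keeps this overlapped pass from accumulating into bytes
;; that were already finished.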
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xgft1_hi, xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_hi, xgft1_lo ;GF add high and low partials
vpand xgft1_hi, xgft1_hi, xtmpl3
vpxor xd1, xd1, xgft1_hi
; dest2
vpshufb xgft2_hi, xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_hi, xgft2_lo ;GF add high and low partials
vpand xgft2_hi, xgft2_hi, xtmpl3
vpxor xd2, xd2, xgft2_hi
; dest3
vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_hi, xgft3_lo ;GF add high and low partials
vpand xgft3_hi, xgft3_hi, xtmpl3
vpxor xd3, xd3, xgft3_hi
XSTR [dest1+tmp], xd1
XSTR [dest2+tmp], xd2
XSTR [dest3+tmp], xd3
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_3vect_mad_avx, 02, 01, 0207

View File

@@ -0,0 +1,317 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
vmovdqa [rsp+16*0],xmm6
vmovdqa [rsp+16*1],xmm7
vmovdqa [rsp+16*2],xmm8
vmovdqa [rsp+16*3],xmm9
vmovdqa [rsp+16*4],xmm10
vmovdqa [rsp+16*5],xmm11
vmovdqa [rsp+16*6],xmm12
vmovdqa [rsp+16*7],xmm13
vmovdqa [rsp+16*8],xmm14
vmovdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r15, 10*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp+16*0]
vmovdqa xmm7, [rsp+16*1]
vmovdqa xmm8, [rsp+16*2]
vmovdqa xmm9, [rsp+16*3]
vmovdqa xmm10, [rsp+16*4]
vmovdqa xmm11, [rsp+16*5]
vmovdqa xmm12, [rsp+16*6]
vmovdqa xmm13, [rsp+16*7]
vmovdqa xmm14, [rsp+16*8]
vmovdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r15, [rsp + 10*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 mul_array
%define dest3 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft1_hi ymm13
%define xgft2_lo ymm12
%define xgft3_lo ymm11
%define x0 ymm0
%define xtmpa ymm1
%define xtmph1 ymm2
%define xtmpl1 ymm3
%define xtmph2 ymm4
%define xtmpl2 ymm5
%define xtmpl2x xmm5
%define xtmph3 ymm6
%define xtmpl3 ymm7
%define xtmpl3x xmm7
%define xd1 ymm8
%define xd2 ymm9
%define xd3 ymm10
align 16
global gf_3vect_mad_avx2:function
func(gf_3vect_mad_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec_i, 5 ;Multiply by 32
sal vec, 5
lea tmp, [mul_array + vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
mov dest2, [dest1+PS] ; reuse mul_array
mov dest3, [dest1+2*PS] ; reuse vec_i
mov dest1, [dest1]
.loop32:
XLDR x0, [src+pos] ;Get next source vector
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
XLDR xd3, [dest3+pos] ;Get next dest vector
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xd1, xtmph1 ;xd1 += partial
; dest2
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xtmph2 ;xd2 += partial
; dest3
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmpl3 ;GF add high and low partials
vpxor xd3, xtmph3 ;xd3 += partial
XSTR [dest1+pos], xd1
XSTR [dest2+pos], xd2
XSTR [dest3+pos], xd3
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
.lessthan32:
;; Tail len
;; Do one more overlap pass
mov tmp.b, 0x1f
vpinsrb xtmpl2x, xtmpl2x, tmp.w, 0
vpbroadcastb xtmpl2, xtmpl2x ;Construct mask 0x1f1f1f...
mov tmp, len ;Overlapped offset length-32
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
XLDR xd3, [dest3+tmp] ;Get next dest vector
sub len, pos
vmovdqa xtmph3, [constip32] ;Load const of i + 32
vpinsrb xtmpl3x, xtmpl3x, len.w, 15
vinserti128 xtmpl3, xtmpl3, xtmpl3x, 1 ;swapped to xtmpl3x | xtmpl3x
vpshufb xtmpl3, xtmpl3, xtmpl2 ;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
vpcmpgtb xtmpl3, xtmpl3, xtmph3
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpand xtmph1, xtmph1, xtmpl3
vpxor xd1, xd1, xtmph1 ;xd1 += partial
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
vpand xtmph2, xtmph2, xtmpl3
vpxor xd2, xd2, xtmph2 ;xd2 += partial
; dest3
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials
vpand xtmph3, xtmph3, xtmpl3
vpxor xd3, xd3, xtmph3 ;xd3 += partial
XSTR [dest1+tmp], xd1
XSTR [dest2+tmp], xd2
XSTR [dest3+tmp], xd3
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 32
constip32:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
;;; func core, ver, snum
slversion gf_3vect_mad_avx2, 04, 01, 0208

View File

@@ -0,0 +1,298 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
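;;; A multiply-and-add kernel: for each of the three outputs k = 0..2 it does
;;; dest[k][i] ^= gf_mul(coef_k, src[i]).  vec is the total number of source
;;; vectors (the stride between the per-destination table banks inside
;;; mul_array), vec_i selects this source's 32-byte lookup table within each
;;; bank, and dest is an array of three destination pointers.
;;;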
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r15, 10*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r15, [rsp + 10*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 mul_array
%define dest3 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xd1 xmm8
%define xd2 xtmpl1
%define xd3 xtmph1
align 16
global gf_3vect_mad_sse:function
func(gf_3vect_mad_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec_i, 5 ;Multiply by 32
sal vec, 5
lea tmp, [mul_array + vec_i]
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
movdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqu xgft3_hi, [tmp+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
mov dest2, [dest1+PS] ; reuse mul_array
mov dest3, [dest1+2*PS] ; reuse vec_i
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
movdqa xtmph1, xgft1_hi ;Reload const array registers
movdqa xtmpl1, xgft1_lo
movdqa xtmph2, xgft2_hi ;Reload const array registers
movdqa xtmpl2, xgft2_lo
movdqa xtmph3, xgft3_hi ;Reload const array registers
movdqa xtmpl3, xgft3_lo
XLDR xd1, [dest1+pos] ;Get next dest vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
; dest1
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pxor xd1, xtmph1
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
; dest2
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pxor xd2, xtmph2
; dest3
pshufb xtmph3, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xtmph3, xtmpl3 ;GF add high and low partials
pxor xd3, xtmph3
XSTR [dest1+pos], xd1 ;Store result
XSTR [dest2+pos], xd2 ;Store result
XSTR [dest3+pos], xd3 ;Store result
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
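;; The last (len mod 16) bytes are covered by one overlapped 16-byte pass that
;; ends exactly at the buffer end; bytes the main loop already updated are
;; masked out of the partial product (signed compare against constip16) so
;; they are not XORed into dest a second time.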
mov tmp, len ;Overlapped offset length-16
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest vector
sub len, pos
movdqa xtmph3, [constip16] ;Load const of i + 16
pinsrb xtmpl3, len.w, 15
pshufb xtmpl3, xmask0f ;Broadcast len to all bytes
pcmpgtb xtmpl3, xtmph3
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
; dest1
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pand xgft1_hi, xtmpl3
pxor xd1, xgft1_hi
; dest2
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pand xgft2_hi, xtmpl3
pxor xd2, xgft2_hi
; dest3
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
pand xgft3_hi, xtmpl3
pxor xd3, xgft3_hi
XSTR [dest1+tmp], xd1 ;Store result
XSTR [dest2+tmp], xd2 ;Store result
XSTR [dest3+tmp], xd3 ;Store result
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_3vect_mad_sse, 00, 01, 0206

View File

@ -0,0 +1,441 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_4vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
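;;; Computes four GF(2^8) dot products in a single pass over the sources:
;;; dests[k][i] = XOR-sum over j of gf_mul(coef[k][j], buffs[j][i]), k = 0..3.
;;; g_tbls holds one 32-byte lookup table per (destination, source) pair,
;;; arranged as four consecutive banks of 32*vec bytes; buffs and dests are
;;; arrays of source and destination pointers.
;;;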
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 9*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_xmm128 xmm12, 6*16
save_xmm128 xmm13, 7*16
save_xmm128 xmm14, 8*16
save_reg r12, 9*16 + 0*8
save_reg r13, 9*16 + 1*8
save_reg r14, 9*16 + 2*8
save_reg r15, 9*16 + 3*8
save_reg rdi, 9*16 + 4*8
save_reg rsi, 9*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
vmovdqa xmm12, [rsp + 6*16]
vmovdqa xmm13, [rsp + 7*16]
vmovdqa xmm14, [rsp + 8*16]
mov r12, [rsp + 9*16 + 0*8]
mov r13, [rsp + 9*16 + 1*8]
mov r14, [rsp + 9*16 + 2*8]
mov r15, [rsp + 9*16 + 3*8]
mov rdi, [rsp + 9*16 + 4*8]
mov rsi, [rsp + 9*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; var1
;;; var2
;;; var3
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
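;;; There are not enough registers in the 32-bit build to keep every argument
;;; and temporary live at once, so several of them share ecx/esi and are
;;; spilled to the four local stack slots above; the SLDR/SSTR macros reload
;;; and store those values around each use (they expand to nothing in the
;;; 64-bit builds).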
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are used for arguments/temps spilled to the stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define arg5 trans2
%define tmp edx
%define tmp2 edi
%define tmp3 trans2
%define tmp3_m var(0)
%define tmp4 trans2
%define tmp4_m var(1)
%define tmp5 trans2
%define tmp5_m var(2)
%define tmp6 trans2
%define tmp6_m var(3)
%define return eax
%macro SLDR 2 ;stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*4 ;4 local variables
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*4 ;4 local variables
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define ptr arg5
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define dest4 tmp5
%define vskip3 tmp6
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp3_m
%define dest3_m tmp4_m
%define dest4_m tmp5_m
%define vskip3_m tmp6_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ; 64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f xmm14
%define xgft1_lo xmm13
%define xgft1_hi xmm12
%define xgft2_lo xmm11
%define xgft2_hi xmm10
%define xgft3_lo xmm9
%define xgft3_hi xmm8
%define xgft4_lo xmm7
%define xgft4_hi xmm6
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%define xp4 xmm5
%else
%define xmm_trans xmm7 ;reuse xmask0f and xgft1_lo
%define xmask0f xmm_trans
%define xgft1_lo xmm_trans
%define xgft1_hi xmm6
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define xgft3_lo xgft1_lo
%define xgft3_hi xgft1_hi
%define xgft4_lo xgft1_lo
%define xgft4_hi xgft1_hi
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%define xp4 xmm5
%endif
align 16
global gf_4vect_dot_prod_avx:function
func(gf_4vect_dot_prod_avx)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov vskip3, vec
imul vskip3, 96
SSTR vskip3_m, vskip3
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
SSTR dest3_m, dest3
mov dest4, [dest1+3*PS]
SSTR dest4_m, dest4
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop16:
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
vpxor xp4, xp4
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
%ifidn PS,8 ;64-bit code
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, PS
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
%else ;32-bit code
XLDR x0, [ptr+pos] ;Get next source vector
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
%endif
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ;32-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
%endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
%ifidn PS,4 ;32-bit code
sal vec, 1
vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
sar vec, 1
%endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
%ifidn PS,4 ;32-bit code
SLDR vskip3, vskip3_m
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
add tmp, 32
add vec_i, PS
%endif
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
vpxor xp4, xgft4_hi ;xp4 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
SLDR dest4, dest4_m
XSTR [dest4+pos], xp4
SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
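;; Unlike the mad kernels, the dot product writes dest from scratch on each
;; pass, so the tail is handled by simply rerunning one full 16-byte pass that
;; ends at the buffer end; recomputing the overlapping bytes is harmless.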
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_4vect_dot_prod_avx, 02, 05, 0193

View File

@ -0,0 +1,460 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
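;;; Same operation as the SSE/AVX versions but on 32-byte ymm vectors: the
;;; 0x0f nibble mask is built with vpbroadcastb, each 32-byte table load puts
;;; the low-nibble table in the low 128-bit lane and the high-nibble table in
;;; the high lane, and one vperm2i128 makes the lane-swapped copy while the
;;; source nibbles are cross-permuted the same way, so the in-lane vpshufb
;;; pairs every nibble with its table.
;;;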
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 9*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
vmovdqa [rsp + 0*16], xmm6
vmovdqa [rsp + 1*16], xmm7
vmovdqa [rsp + 2*16], xmm8
vmovdqa [rsp + 3*16], xmm9
vmovdqa [rsp + 4*16], xmm10
vmovdqa [rsp + 5*16], xmm11
vmovdqa [rsp + 6*16], xmm12
vmovdqa [rsp + 7*16], xmm13
vmovdqa [rsp + 8*16], xmm14
save_reg r12, 9*16 + 0*8
save_reg r13, 9*16 + 1*8
save_reg r14, 9*16 + 2*8
save_reg r15, 9*16 + 3*8
save_reg rdi, 9*16 + 4*8
save_reg rsi, 9*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
vmovdqa xmm12, [rsp + 6*16]
vmovdqa xmm13, [rsp + 7*16]
vmovdqa xmm14, [rsp + 8*16]
mov r12, [rsp + 9*16 + 0*8]
mov r13, [rsp + 9*16 + 1*8]
mov r14, [rsp + 9*16 + 2*8]
mov r15, [rsp + 9*16 + 3*8]
mov rdi, [rsp + 9*16 + 4*8]
mov rsi, [rsp + 9*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; var1
;;; var2
;;; var3
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are used for arguments/temps spilled to the stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define arg5 trans2
%define tmp edx
%define tmp.w edx
%define tmp.b dl
%define tmp2 edi
%define tmp3 trans2
%define tmp3_m var(0)
%define tmp4 trans2
%define tmp4_m var(1)
%define tmp5 trans2
%define tmp5_m var(2)
%define tmp6 trans2
%define tmp6_m var(3)
%define return eax
%macro SLDR 2 ;stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*4 ;4 local variables
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*4 ;4 local variables
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define ptr arg5
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define dest4 tmp5
%define vskip3 tmp6
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp3_m
%define dest3_m tmp4_m
%define dest4_m tmp5_m
%define vskip3_m tmp6_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ;64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f ymm14
%define xmask0fx xmm14
%define xgft1_lo ymm13
%define xgft1_hi ymm12
%define xgft2_lo ymm11
%define xgft2_hi ymm10
%define xgft3_lo ymm9
%define xgft3_hi ymm8
%define xgft4_lo ymm7
%define xgft4_hi ymm6
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%define xp3 ymm4
%define xp4 ymm5
%else
%define ymm_trans ymm7 ;reuse xmask0f and xgft1_hi
%define xmask0f ymm_trans
%define xmask0fx xmm7
%define xgft1_lo ymm6
%define xgft1_hi ymm_trans
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define xgft3_lo xgft1_lo
%define xgft3_hi xgft1_hi
%define xgft4_lo xgft1_lo
%define xgft4_hi xgft1_hi
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%define xp3 ymm4
%define xp4 ymm5
%endif
align 16
global gf_4vect_dot_prod_avx2:function
func(gf_4vect_dot_prod_avx2)
FUNC_SAVE
SLDR len, len_m
sub len, 32
SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
mov vskip3, vec
imul vskip3, 96
SSTR vskip3_m, vskip3
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
SSTR dest3_m, dest3
mov dest4, [dest1+3*PS]
SSTR dest4_m, dest4
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop32:
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
vpxor xp4, xp4
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
XLDR x0, [ptr+pos] ;Get next source vector
add vec_i, PS
%ifidn PS,8 ;64-bit code
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
; " Dx{00}, Dx{10}, ..., Dx{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
add tmp, 32
%else ;32-bit code
mov cl, 0x0f ;use ecx as a temp variable
vpinsrb xmask0fx, xmask0fx, ecx, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
%endif
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ; 32-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
%endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
%ifidn PS,4 ; 32-bit code
sal vec, 1
vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
sar vec, 1
%endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
%ifidn PS,4 ; 32-bit code
SLDR vskip3, vskip3_m
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
; " DX{00}, Dx{10}, ..., Dx{f0}
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
add tmp, 32
%endif
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
vpxor xp4, xgft4_hi ;xp4 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
SLDR dest4, dest4_m
XSTR [dest4+pos], xp4
SLDR len, len_m
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
jmp .loop32 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_4vect_dot_prod_avx2, 04, 05, 0198

View File

@ -0,0 +1,443 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 9*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_xmm128 xmm12, 6*16
save_xmm128 xmm13, 7*16
save_xmm128 xmm14, 8*16
save_reg r12, 9*16 + 0*8
save_reg r13, 9*16 + 1*8
save_reg r14, 9*16 + 2*8
save_reg r15, 9*16 + 3*8
save_reg rdi, 9*16 + 4*8
save_reg rsi, 9*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp + 0*16]
movdqa xmm7, [rsp + 1*16]
movdqa xmm8, [rsp + 2*16]
movdqa xmm9, [rsp + 3*16]
movdqa xmm10, [rsp + 4*16]
movdqa xmm11, [rsp + 5*16]
movdqa xmm12, [rsp + 6*16]
movdqa xmm13, [rsp + 7*16]
movdqa xmm14, [rsp + 8*16]
mov r12, [rsp + 9*16 + 0*8]
mov r13, [rsp + 9*16 + 1*8]
mov r14, [rsp + 9*16 + 2*8]
mov r15, [rsp + 9*16 + 3*8]
mov rdi, [rsp + 9*16 + 4*8]
mov rsi, [rsp + 9*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; var0
;;; var1
;;; var2
;;; var3
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define var(x) [ebp - PS - PS*x]
%define trans ecx
%define trans2 esi
%define arg0 trans ;trans and trans2 are used for arguments/temps spilled to the stack
%define arg0_m arg(0)
%define arg1 ebx
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 trans
%define arg3_m arg(3)
%define arg4 trans
%define arg4_m arg(4)
%define arg5 trans2
%define tmp edx
%define tmp2 edi
%define tmp3 trans2
%define tmp3_m var(0)
%define tmp4 trans2
%define tmp4_m var(1)
%define tmp5 trans2
%define tmp5_m var(2)
%define tmp6 trans2
%define tmp6_m var(3)
%define return eax
%macro SLDR 2 ;stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
sub esp, PS*4 ;4 local variables
push esi
push edi
push ebx
mov arg1, arg(1)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
add esp, PS*4 ;4 local variables
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define ptr arg5
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define dest4 tmp5
%define vskip3 tmp6
%define pos return
%ifidn PS,4 ;32-bit code
%define len_m arg0_m
%define src_m arg3_m
%define dest1_m arg4_m
%define dest2_m tmp3_m
%define dest3_m tmp4_m
%define dest4_m tmp5_m
%define vskip3_m tmp6_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
%ifidn PS,8 ; 64-bit code
default rel
[bits 64]
%endif
section .text
%ifidn PS,8 ;64-bit code
%define xmask0f xmm14
%define xgft1_lo xmm2
%define xgft1_hi xmm3
%define xgft2_lo xmm11
%define xgft2_hi xmm4
%define xgft3_lo xmm9
%define xgft3_hi xmm5
%define xgft4_lo xmm7
%define xgft4_hi xmm6
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm8
%define xp2 xmm10
%define xp3 xmm12
%define xp4 xmm13
%else
%define xmm_trans xmm7 ;reuse xmask0f and xgft1_lo
%define xmask0f xmm_trans
%define xgft1_lo xmm_trans
%define xgft1_hi xmm6
%define xgft2_lo xgft1_lo
%define xgft2_hi xgft1_hi
%define xgft3_lo xgft1_lo
%define xgft3_hi xgft1_hi
%define xgft4_lo xgft1_lo
%define xgft4_hi xgft1_hi
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%define xp4 xmm5
%endif
align 16
global gf_4vect_dot_prod_sse:function
func(gf_4vect_dot_prod_sse)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov vskip3, vec
imul vskip3, 96
SSTR vskip3_m, vskip3
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
SLDR dest1, dest1_m
mov dest2, [dest1+PS]
SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
SSTR dest3_m, dest3
mov dest4, [dest1+3*PS]
SSTR dest4_m, dest4
mov dest1, [dest1]
SSTR dest1_m, dest1
.loop16:
pxor xp1, xp1
pxor xp2, xp2
pxor xp3, xp3
pxor xp4, xp4
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
SLDR src, src_m
mov ptr, [src+vec_i]
%ifidn PS,8 ;64-bit code
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
movdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, PS
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
%else ;32-bit code
XLDR x0, [ptr+pos] ;Get next source vector
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
%endif
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
%ifidn PS,4 ;32-bit code
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
%endif
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp2, xgft2_hi ;xp2 += partial
%ifidn PS,4 ;32-bit code
sal vec, 1
movdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
sar vec, 1
%endif
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
pxor xp3, xgft3_hi ;xp3 += partial
%ifidn PS,4 ;32-bit code
SLDR vskip3, vskip3_m
movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
add tmp, 32
add vec_i, PS
%endif
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
pxor xp4, xgft4_hi ;xp4 += partial
cmp vec_i, vec
jl .next_vect
SLDR dest1, dest1_m
SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
SLDR dest4, dest4_m
XSTR [dest4+pos], xp4
SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_4vect_dot_prod_sse, 00, 06, 0064

View File

@ -0,0 +1,281 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
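
/*
 * Illustrative-only scalar sketch (not part of the original test) of what
 * gf_vect_dot_prod_base and the SIMD kernels compute.  It assumes the 32-byte
 * table layout the kernels index: for each coefficient, bytes 0-15 hold the
 * products with 0x00..0x0f (low nibble) and bytes 16-31 the products with
 * 0x00,0x10,..,0xf0 (high nibble), so one GF(2^8) multiply is two lookups and
 * an XOR.  The function name is hypothetical and the function is unused here.
 */
static void gf_vect_dot_prod_scalar(int len, int vlen, unsigned char *gftbls,
				    unsigned char **src, unsigned char *dest)
{
	int i, j;
	unsigned char s, p;

	for (i = 0; i < len; i++) {
		p = 0;
		for (j = 0; j < vlen; j++) {
			s = src[j][i];
			p ^= gftbls[j * 32 + (s & 0xf)];	// low-nibble partial
			p ^= gftbls[j * 32 + 16 + (s >> 4)];	// high-nibble partial
		}
		dest[i] = p;
	}
}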
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES];
u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3;
u8 *dest_ref4, *dest_ptrs[4];
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref4 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
dest_ptrs[3] = dest4;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(dest_ref3, 0, TEST_LEN);
memset(dest_ref4, 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
}
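	// Expand each coefficient into its 32-byte lookup table.  The tables for
	// the four destinations sit in four consecutive banks of
	// 32 * TEST_SOURCES bytes, which is the layout the kernels index with
	// vec and vskip3.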
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
dest_ref4);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS / 100; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
buffs, dest_ref4);
}
perf_stop(&stop);
printf("gf_4vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 4) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
}
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 4) * i);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

View File

@ -0,0 +1,692 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 16
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 10000
#define TEST_TYPE_STR ""
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES];
u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3;
u8 *dest_ref4, *dest_ptrs[4];
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptrs[4];
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref4 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
dest_ptrs[3] = dest4;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(dest_ref3, 0, TEST_LEN);
memset(dest_ref4, 0, TEST_LEN);
memset(g1, 2, TEST_SOURCES);
memset(g2, 1, TEST_SOURCES);
memset(g3, 7, TEST_SOURCES);
memset(g4, 3, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
dest_ref4);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
buffs, dest_ref4);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
dest_ref4);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test1 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test2 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test3 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test4 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
putchar('.');
}
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
efence_buffs, dest_ref2);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
efence_buffs, dest_ref3);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
efence_buffs, dest_ref4);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, align);
printf("dprod_dut:");
dump(dest1, align);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, align);
printf("dprod_dut:");
dump(dest2, align);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, align);
printf("dprod_dut:");
dump(dest3, align);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, align);
printf("dprod_dut:");
dump(dest4, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(udest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(udest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(udest_ptrs[2], 25);
return -1;
}
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(udest_ptrs[3], 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptrs[0] - dest1;
if (memcmp(dest1, dest_ref1, offset)) {
printf("Fail rand ualign pad1 start\n");
return -1;
}
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad1 end\n");
printf("size=%d offset=%d srcs=%d\n", size, offset, srcs);
return -1;
}
offset = udest_ptrs[1] - dest2;
if (memcmp(dest2, dest_ref1, offset)) {
printf("Fail rand ualign pad2 start\n");
return -1;
}
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad2 end\n");
return -1;
}
offset = udest_ptrs[2] - dest3;
if (memcmp(dest3, dest_ref1, offset)) {
printf("Fail rand ualign pad3 start\n");
return -1;
}
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad3 end\n");
return -1;
}
offset = udest_ptrs[3] - dest4;
if (memcmp(dest4, dest_ref1, offset)) {
printf("Fail rand ualign pad4 start\n");
return -1;
}
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad4 end\n");
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest_ptrs[2], 25);
return -1;
}
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest_ptrs[3], 25);
return -1;
}
}
printf("Pass\n");
return 0;
}

View File

@ -0,0 +1,336 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
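;;; Multiply-accumulate of one source into four destinations (AVX, 16 bytes
;;; per iteration): dest[k] ^= gf_mul(coeff[k], src) over len bytes, using the
;;; 32-byte low/high-nibble lookup tables for source vec_i located at
;;; mul_array + (k*vec + vec_i)*32. dest is an array of four destination
;;; pointers; len must be at least 16 or the function returns 1.
;;;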
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp2 r10
%define tmp3 r13
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r15, 10*16 + 2*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r15, [rsp + 10*16 + 2*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r12
%define return rax
%define return.w eax
%define func(x) x:
%macro FUNC_SAVE 0
push r12
%endmacro
%macro FUNC_RESTORE 0
pop r12
%endmacro
%endif
;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 mul_array
%define dest3 tmp2
%define dest4 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft3_hi xmm14
%define xgft4_hi xmm13
%define xgft4_lo xmm12
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph4 xmm8
%define xtmpl4 xmm9
%define xd1 xmm10
%define xd2 xmm11
%define xd3 xtmph1
%define xd4 xtmpl1
align 16
global gf_4vect_mad_avx:function
func(gf_4vect_mad_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov tmp, vec
sal vec_i, 5 ;Multiply by 32
lea tmp3, [mul_array + vec_i]
sal tmp, 6 ;Multiply by 64
vmovdqu xgft3_hi, [tmp3+tmp+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
sal vec, 5 ;Multiply by 32
add tmp, vec
vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
mov dest2, [dest1+PS] ; reuse mul_array
mov dest3, [dest1+2*PS]
mov dest4, [dest1+3*PS] ; reuse vec_i
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xd1, xtmph1
XLDR xd3, [dest3+pos] ;Reuse xtmph1, Get next dest vector
XLDR xd4, [dest4+pos] ;Reuse xtmpl1, Get next dest vector
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xd2, xtmph2
; dest3
vpshufb xtmph3, xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
vpxor xd3, xd3, xtmph3
; dest4
vpshufb xtmph4, xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl4, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph4, xtmph4, xtmpl4 ;GF add high and low partials
vpxor xd4, xd4, xtmph4
XSTR [dest1+pos], xd1 ;Store result
XSTR [dest2+pos], xd2 ;Store result
XSTR [dest3+pos], xd3 ;Store result
XSTR [dest4+pos], xd4 ;Store result
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
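;; The remaining (len mod 16) bytes are processed by redoing the last 16-byte
;; block that ends exactly at the buffer end. A byte mask is built from
;; constip16 and the count of bytes already covered by .loop16, and the GF
;; partial products are ANDed with it, so only the not-yet-written tail bytes
;; change while bytes already produced by the main loop are XORed with zero.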
mov tmp, len ;Overlapped offset length-16
XLDR x0, [src+tmp] ;Get next source vector
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
XLDR xtmph4, [dest3+tmp] ;Get next dest vector
sub len, pos
vmovdqa xtmpl4, [constip16] ;Load const of i + 16
vpinsrb xtmph3, xtmph3, len.w, 15
vpshufb xtmph3, xtmph3, xmask0f ;Broadcast len to all bytes
vpcmpgtb xtmph3, xtmph3, xtmpl4
XLDR xtmpl4, [dest4+tmp] ;Get next dest vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpand xtmph1, xtmph1, xtmph3
vpxor xd1, xd1, xtmph1
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpand xtmph2, xtmph2, xtmph3
vpxor xd2, xd2, xtmph2
; dest3
vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_hi, xtmpl3 ;GF add high and low partials
vpand xgft3_hi, xgft3_hi, xtmph3
vpxor xtmph4, xtmph4, xgft3_hi
; dest4
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
vpand xgft4_hi, xgft4_hi, xtmph3
vpxor xtmpl4, xtmpl4, xgft4_hi
XSTR [dest1+tmp], xd1 ;Store result
XSTR [dest2+tmp], xd2 ;Store result
XSTR [dest3+tmp], xtmph4 ;Store result
XSTR [dest4+tmp], xtmpl4 ;Store result
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_4vect_mad_avx, 02, 01, 020a

View File

@ -0,0 +1,342 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
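;;; Same four-destination multiply-accumulate as the AVX version, but on 32
;;; bytes per iteration with AVX2: dest[k] ^= gf_mul(coeff[k], src), with the
;;; per-source lookup tables at mul_array + (k*vec + vec_i)*32. len must be at
;;; least 32 or the function returns 1.
;;;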
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r15, 10*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r15, [rsp + 10*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 mul_array
%define dest3 vec
%define dest4 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft2_lo ymm13
%define xgft3_lo ymm12
%define xgft4_lo ymm11
%define x0 ymm0
%define xtmpa ymm1
%define xtmpl ymm2
%define xtmplx xmm2
%define xtmph1 ymm3
%define xtmph1x xmm3
%define xtmph2 ymm4
%define xtmph3 ymm5
%define xtmph4 ymm6
%define xd1 ymm7
%define xd2 ymm8
%define xd3 ymm9
%define xd4 ymm10
align 16
global gf_4vect_mad_avx2:function
func(gf_4vect_mad_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec_i, 5 ;Multiply by 32
sal vec, 5 ;Multiply by 32
lea tmp, [mul_array + vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
add tmp, vec
vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
mov dest2, [dest1+PS] ; reuse mul_array
mov dest3, [dest1+2*PS] ; reuse vec
mov dest4, [dest1+3*PS] ; reuse vec_i
mov dest1, [dest1]
.loop32:
XLDR x0, [src+pos] ;Get next source vector
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
XLDR xd3, [dest3+pos] ;Get next dest vector
XLDR xd4, [dest4+pos] ;Get next dest vector
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
vpxor xd1, xd1, xtmph1 ;xd1 += partial
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
vpxor xd2, xd2, xtmph2 ;xd2 += partial
; dest3
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xtmpl ;GF add high and low partials
vpxor xd3, xd3, xtmph3 ;xd3 += partial
; dest4
vpshufb xtmph4, xtmph4, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph4, xtmph4, xtmpl ;GF add high and low partials
vpxor xd4, xd4, xtmph4 ;xd4 += partial
XSTR [dest1+pos], xd1
XSTR [dest2+pos], xd2
XSTR [dest3+pos], xd3
XSTR [dest4+pos], xd4
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
.lessthan32:
;; Tail len
;; Do one more overlap pass
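;; The remaining (len mod 32) bytes are processed by redoing the last 32-byte
;; block ending at the buffer end. A byte mask is built from constip32 and the
;; count of bytes already covered by .loop32, and the GF partial products are
;; ANDed with it, so only the not-yet-written tail bytes change.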
mov tmp.b, 0x1f
vpinsrb xtmph1x, xtmph1x, tmp.w, 0
vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f...
mov tmp, len ;Overlapped offset length-32
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
XLDR xd3, [dest3+tmp] ;Get next dest vector
XLDR xd4, [dest4+tmp] ;Get next dest vector
sub len, pos
vmovdqa xtmph2, [constip32] ;Load const of i + 32
vpinsrb xtmplx, xtmplx, len.w, 15
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
vpcmpgtb xtmpl, xtmpl, xtmph2
vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials
vpand xtmph1, xtmph1, xtmpl
vpxor xd1, xd1, xtmph1 ;xd1 += partial
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
vpand xtmph2, xtmph2, xtmpl
vpxor xd2, xd2, xtmph2 ;xd2 += partial
; dest3
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials
vpand xtmph3, xtmph3, xtmpl
vpxor xd3, xd3, xtmph3 ;xd3 += partial
; dest4
vpshufb xtmph4, xtmph4, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph4, xtmph4, xgft4_lo ;GF add high and low partials
vpand xtmph4, xtmph4, xtmpl
vpxor xd4, xd4, xtmph4 ;xd4 += partial
XSTR [dest1+tmp], xd1
XSTR [dest2+tmp], xd2
XSTR [dest3+tmp], xd3
XSTR [dest4+tmp], xd4
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 32
constip32:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
;;; func core, ver, snum
slversion gf_4vect_mad_avx2, 04, 01, 020b

View File

@ -0,0 +1,342 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
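;;; SSE variant of the four-destination multiply-accumulate, processing 16
;;; bytes per iteration: dest[k] ^= gf_mul(coeff[k], src), with the lookup
;;; tables laid out as mul_array + (k*vec + vec_i)*32. len must be at least 16
;;; or the function returns 1.
;;;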
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp2 r10
%define tmp3 r13
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r15, 10*16 + 2*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r15, [rsp + 10*16 + 2*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r12
%define return rax
%define return.w eax
%define func(x) x:
%macro FUNC_SAVE 0
push r12
%endmacro
%macro FUNC_RESTORE 0
pop r12
%endmacro
%endif
;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 mul_array
%define dest3 tmp2
%define dest4 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft3_hi xmm14
%define xgft4_hi xmm13
%define xgft4_lo xmm12
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph4 xmm8
%define xtmpl4 xmm9
%define xd1 xmm10
%define xd2 xmm11
%define xd3 xtmph1
%define xd4 xtmpl1
align 16
global gf_4vect_mad_sse:function
func(gf_4vect_mad_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov tmp, vec
sal vec_i, 5 ;Multiply by 32
lea tmp3, [mul_array + vec_i]
sal tmp, 6 ;Multiply by 64
movdqu xgft3_hi, [tmp3+tmp+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
sal vec, 5 ;Multiply by 32
add tmp, vec
movdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
movdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
mov dest2, [dest1+PS] ; reuse mul_array
mov dest3, [dest1+2*PS]
mov dest4, [dest1+3*PS] ; reuse vec_i
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqa xtmph3, xgft3_hi
movdqa xtmpl4, xgft4_lo
movdqa xtmph4, xgft4_hi
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
; dest1
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pxor xd1, xtmph1
XLDR xd3, [dest3+pos] ;Reuse xtmph1, Get next dest vector
XLDR xd4, [dest4+pos] ;Reuse xtmpl1, Get next dest vector
; dest2
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pxor xd2, xtmph2
; dest3
pshufb xtmph3, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xtmph3, xtmpl3 ;GF add high and low partials
pxor xd3, xtmph3
; dest4
pshufb xtmph4, x0 ;Lookup mul table of high nibble
pshufb xtmpl4, xtmpa ;Lookup mul table of low nibble
pxor xtmph4, xtmpl4 ;GF add high and low partials
pxor xd4, xtmph4
XSTR [dest1+pos], xd1 ;Store result
XSTR [dest2+pos], xd2 ;Store result
XSTR [dest3+pos], xd3 ;Store result
XSTR [dest4+pos], xd4 ;Store result
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
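;; As in the AVX version, the tail is handled by redoing the last 16-byte
;; block ending at the buffer end, with a constip16-based byte mask applied to
;; the partial products so only the not-yet-written bytes are updated.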
mov tmp, len ;Overlapped offset length-16
XLDR x0, [src+tmp] ;Get next source vector
movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
XLDR xtmph4, [dest3+tmp] ;Reuse xtmph1. Get next dest vector
sub len, pos
movdqa xtmpl4, [constip16] ;Load const of i + 16
pinsrb xtmph3, len.w, 15
pshufb xtmph3, xmask0f ;Broadcast len to all bytes
pcmpgtb xtmph3, xtmpl4
XLDR xtmpl4, [dest4+tmp] ;Get next dest vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
; dest1
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pand xtmph1, xtmph3
pxor xd1, xtmph1
; dest2
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pand xtmph2, xtmph3
pxor xd2, xtmph2
; dest3
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xtmpl3 ;GF add high and low partials
pand xgft3_hi, xtmph3
pxor xtmph4, xgft3_hi
; dest4
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
pand xgft4_hi, xtmph3
pxor xtmpl4, xgft4_hi
XSTR [dest1+tmp], xd1 ;Store result
XSTR [dest2+tmp], xd2 ;Store result
XSTR [dest3+tmp], xtmph4 ;Store result
XSTR [dest4+tmp], xtmpl4 ;Store result
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_4vect_mad_sse, 00, 01, 0209

View File

@ -0,0 +1,303 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_5vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
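;;; Computes five GF(2^8) dot products in one pass over the sources: for each
;;; k in 0..4, dests[k] = XOR over j < vec of gf_mul(g_tbls[(k*vec + j)*32],
;;; buffs[j]) across len bytes. buffs and dests are arrays of pointers; len
;;; must be at least 16 or the function returns 1, and a short tail is handled
;;; with one overlapped pass over the final 16 bytes.
;;;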
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_xmm128 xmm12, 6*16
save_xmm128 xmm13, 7*16
save_xmm128 xmm14, 8*16
save_xmm128 xmm15, 9*16
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
save_reg rsi, 10*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
vmovdqa xmm12, [rsp + 6*16]
vmovdqa xmm13, [rsp + 7*16]
vmovdqa xmm14, [rsp + 8*16]
vmovdqa xmm15, [rsp + 9*16]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
mov rsi, [rsp + 10*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define ptr arg5
%define vec_i tmp2
%define dest1 tmp3
%define dest2 tmp4
%define vskip1 tmp5
%define vskip3 tmp6
%define pos return
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9
%define xgft4_lo xmm8
%define xgft4_hi xmm7
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%define xp4 xmm5
%define xp5 xmm6
align 16
global gf_5vect_dot_prod_avx:function
func(gf_5vect_dot_prod_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov vskip1, vec
imul vskip1, 32
mov vskip3, vec
imul vskip3, 96
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
mov dest1, [dest]
mov dest2, [dest+PS]
.loop16:
mov tmp, mul_array
xor vec_i, vec_i
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
vpxor xp4, xp4
vpxor xp5, xp5
.next_vect:
mov ptr, [src+vec_i]
add vec_i, PS
XLDR x0, [ptr+pos] ;Get next source vector
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
vmovdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
vmovdqu xgft1_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
add tmp, 32
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
vpxor xp4, xgft4_hi ;xp4 += partial
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp5, xgft1_hi ;xp5 += partial
cmp vec_i, vec
jl .next_vect
mov tmp, [dest+2*PS]
mov ptr, [dest+3*PS]
mov vec_i, [dest+4*PS]
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
XSTR [tmp+pos], xp3
XSTR [ptr+pos], xp4
XSTR [vec_i+pos], xp5
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_5vect_dot_prod_avx, 02, 04, 0194

View File

@ -0,0 +1,315 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
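;;; AVX2 variant of the five-way dot product, working on 32 bytes per
;;; iteration: dests[k] = XOR over j < vec of gf_mul(g_tbls[(k*vec + j)*32],
;;; buffs[j]). len must be at least 32 or the function returns 1; a short tail
;;; is handled with one overlapped pass over the final 32 bytes.
;;;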
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
vmovdqa [rsp + 0*16], xmm6
vmovdqa [rsp + 1*16], xmm7
vmovdqa [rsp + 2*16], xmm8
vmovdqa [rsp + 3*16], xmm9
vmovdqa [rsp + 4*16], xmm10
vmovdqa [rsp + 5*16], xmm11
vmovdqa [rsp + 6*16], xmm12
vmovdqa [rsp + 7*16], xmm13
vmovdqa [rsp + 8*16], xmm14
vmovdqa [rsp + 9*16], xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
save_reg rsi, 10*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
vmovdqa xmm12, [rsp + 6*16]
vmovdqa xmm13, [rsp + 7*16]
vmovdqa xmm14, [rsp + 8*16]
vmovdqa xmm15, [rsp + 9*16]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
mov rsi, [rsp + 10*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define ptr arg5
%define vec_i tmp2
%define dest1 tmp3
%define dest2 tmp4
%define vskip1 tmp5
%define vskip3 tmp6
%define pos return
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft1_hi ymm13
%define xgft2_lo ymm12
%define xgft2_hi ymm11
%define xgft3_lo ymm10
%define xgft3_hi ymm9
%define xgft4_lo ymm8
%define xgft4_hi ymm7
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%define xp3 ymm4
%define xp4 ymm5
%define xp5 ymm6
align 16
global gf_5vect_dot_prod_avx2:function
func(gf_5vect_dot_prod_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
mov vskip1, vec
imul vskip1, 32
mov vskip3, vec
imul vskip3, 96
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
mov dest1, [dest]
mov dest2, [dest+PS]
.loop32:
mov tmp, mul_array
xor vec_i, vec_i
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
vpxor xp4, xp4
vpxor xp5, xp5
.next_vect:
mov ptr, [src+vec_i]
XLDR x0, [ptr+pos] ;Get next source vector
add vec_i, PS
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
; " Dx{00}, Dx{10}, ..., Dx{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
vmovdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
; " Ex{00}, Ex{10}, ..., Ex{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
add tmp, 32
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
vpxor xp4, xgft4_hi ;xp4 += partial
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp5, xgft1_hi ;xp5 += partial
cmp vec_i, vec
jl .next_vect
mov tmp, [dest+2*PS]
mov ptr, [dest+3*PS]
mov vec_i, [dest+4*PS]
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
XSTR [tmp+pos], xp3
XSTR [ptr+pos], xp4
XSTR [vec_i+pos], xp5
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
jmp .loop32 ;Do one more overlap pass
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_5vect_dot_prod_avx2, 04, 04, 0199

View File

@ -0,0 +1,304 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
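;;; SSE variant of the five-way dot product, 16 bytes per iteration:
;;; dests[k] = XOR over j < vec of gf_mul(g_tbls[(k*vec + j)*32], buffs[j]).
;;; len must be at least 16.
;;;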
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_xmm128 xmm12, 6*16
save_xmm128 xmm13, 7*16
save_xmm128 xmm14, 8*16
save_xmm128 xmm15, 9*16
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
save_reg rsi, 10*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp + 0*16]
movdqa xmm7, [rsp + 1*16]
movdqa xmm8, [rsp + 2*16]
movdqa xmm9, [rsp + 3*16]
movdqa xmm10, [rsp + 4*16]
movdqa xmm11, [rsp + 5*16]
movdqa xmm12, [rsp + 6*16]
movdqa xmm13, [rsp + 7*16]
movdqa xmm14, [rsp + 8*16]
movdqa xmm15, [rsp + 9*16]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
mov rsi, [rsp + 10*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define ptr arg5
%define vec_i tmp2
%define dest1 tmp3
%define dest2 tmp4
%define vskip1 tmp5
%define vskip3 tmp6
%define pos return
%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft1_lo xmm2
%define xgft1_hi xmm3
%define xgft2_lo xmm4
%define xgft2_hi xmm5
%define xgft3_lo xmm10
%define xgft3_hi xmm6
%define xgft4_lo xmm8
%define xgft4_hi xmm7
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm9
%define xp2 xmm11
%define xp3 xmm12
%define xp4 xmm13
%define xp5 xmm14
align 16
global gf_5vect_dot_prod_sse:function
func(gf_5vect_dot_prod_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov vskip1, vec
imul vskip1, 32
mov vskip3, vec
imul vskip3, 96
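;vskip1 = vec*32 spans one destination's tables; vskip3 = vec*96 reaches the
;fourth destination's tables, and the fifth is addressed as vskip1*4 below.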
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
mov dest1, [dest]
mov dest2, [dest+PS]
.loop16:
mov tmp, mul_array
xor vec_i, vec_i
pxor xp1, xp1
pxor xp2, xp2
pxor xp3, xp3
pxor xp4, xp4
pxor xp5, xp5
.next_vect:
mov ptr, [src+vec_i]
add vec_i, PS
XLDR x0, [ptr+pos] ;Get next source vector
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
movdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
movdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
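;Each pshufb below uses a 16-entry table to multiply one nibble of every source
;byte by the destination's coefficient; the pxor of the two partials is the
;GF(2^8) add, accumulated into xp1..xp5.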
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp2, xgft2_hi ;xp2 += partial
movdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
movdqu xgft1_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
add tmp, 32
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
pxor xp3, xgft3_hi ;xp3 += partial
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
pxor xp4, xgft4_hi ;xp4 += partial
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp5, xgft1_hi ;xp5 += partial
cmp vec_i, vec
jl .next_vect
mov tmp, [dest+2*PS]
mov ptr, [dest+3*PS]
mov vec_i, [dest+4*PS]
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
XSTR [tmp+pos], xp3
XSTR [ptr+pos], xp4
XSTR [vec_i+pos], xp5
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
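;; The overlap pass recomputes the final 16 bytes at offset len (original length
;; minus 16); the entry check guarantees len >= 0 here, and rewriting bytes the
;; main loop already produced is safe because the same inputs give the same output.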
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_5vect_dot_prod_sse, 00, 05, 0065

@ -0,0 +1,319 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls, *buffs[TEST_SOURCES];
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest_ref1, *dest_ref2;
u8 *dest_ref3, *dest_ref4, *dest_ref5, *dest_ptrs[5];
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 16, 6 * TEST_SOURCES * 32)) {
printf("alloc error: Fail");
return -1;
}
g_tbls = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest5 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref5 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
dest_ptrs[3] = dest4;
dest_ptrs[4] = dest5;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest5, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(dest_ref3, 0, TEST_LEN);
memset(dest_ref4, 0, TEST_LEN);
memset(dest_ref5, 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
}
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
}
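/*
 * g_tbls layout assumed by the 5-destination kernels: one 32-byte multiply table
 * per source per destination, grouped by destination, so destination d's tables
 * start at g_tbls + d * 32 * TEST_SOURCES.
 */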
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
dest_ref5);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS / 20; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
buffs, dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
buffs, dest_ref5);
}
perf_stop(&stop);
printf("gf_5vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 5) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
}
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 5) * i);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test4\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test5\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

@ -0,0 +1,805 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 16
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 20000
#define TEST_TYPE_STR ""
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
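/*
 * PTR_ALIGN_CHK_B and LEN_ALIGN_CHK_B set how far the tests below vary buffer
 * offsets and lengths; both are 0 in EC_ALIGNED_ADDR builds, so only aligned
 * pointers and 16-byte-multiple lengths are exercised.
 */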
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls;
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *buffs[TEST_SOURCES];
u8 *dest_ref1, *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5;
u8 *dest_ptrs[5];
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptrs[5];
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
printf("alloc error: Fail");
return -1;
}
g_tbls = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest5 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref5 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
dest_ptrs[3] = dest4;
dest_ptrs[4] = dest5;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest5, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(dest_ref3, 0, TEST_LEN);
memset(dest_ref4, 0, TEST_LEN);
memset(dest_ref5, 0, TEST_LEN);
memset(g1, 2, TEST_SOURCES);
memset(g2, 1, TEST_SOURCES);
memset(g3, 7, TEST_SOURCES);
memset(g4, 9, TEST_SOURCES);
memset(g5, 4, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
dest_ref5);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
buffs, dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
buffs, dest_ref5);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
dest_ref5);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test1 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test2 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test3 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test4 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test5 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
putchar('.');
}
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up 'size' bytes from the end of each buffer
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
efence_buffs, dest_ref2);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
efence_buffs, dest_ref3);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
efence_buffs, dest_ref4);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
efence_buffs, dest_ref5);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, align);
printf("dprod_dut:");
dump(dest1, align);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, align);
printf("dprod_dut:");
dump(dest2, align);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, align);
printf("dprod_dut:");
dump(dest3, align);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, align);
printf("dprod_dut:");
dump(dest4, align);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, align);
printf("dprod_dut:");
dump(dest5, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest5, 0, TEST_LEN);
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(udest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(udest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(udest_ptrs[2], 25);
return -1;
}
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(udest_ptrs[3], 25);
return -1;
}
if (memcmp(dest_ref5, udest_ptrs[4], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(udest_ptrs[4], 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptrs[0] - dest1;
if (memcmp(dest1, dest_ref1, offset)) {
printf("Fail rand ualign pad1 start\n");
return -1;
}
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad1 end\n");
return -1;
}
offset = udest_ptrs[1] - dest2;
if (memcmp(dest2, dest_ref1, offset)) {
printf("Fail rand ualign pad2 start\n");
return -1;
}
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad2 end\n");
return -1;
}
offset = udest_ptrs[2] - dest3;
if (memcmp(dest3, dest_ref1, offset)) {
printf("Fail rand ualign pad3 start\n");
return -1;
}
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad3 end\n");
return -1;
}
offset = udest_ptrs[3] - dest4;
if (memcmp(dest4, dest_ref1, offset)) {
printf("Fail rand ualign pad4 start\n");
return -1;
}
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad4 end\n");
return -1;
}
offset = udest_ptrs[4] - dest5;
if (memcmp(dest5, dest_ref1, offset)) {
printf("Fail rand ualign pad5 start\n");
return -1;
}
if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad5 end\n");
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest_ptrs[2], 25);
return -1;
}
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest_ptrs[3], 25);
return -1;
}
if (memcmp(dest_ref5, dest_ptrs[4], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest_ptrs[4], 25);
return -1;
}
}
printf("Pass\n");
return 0;
}

@ -0,0 +1,365 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
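;;;
;;; Multiply-and-add form: src is a single source buffer (index vec_i out of vec),
;;; and each of the five destinations is updated in place as
;;; dest[d][j] ^= gf_mul(coef[d][vec_i], src[j]), with mul_array laid out exactly as
;;; for the dot-product kernels (vec*32 bytes of 32-byte tables per destination).
;;;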
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp2 r10
%define tmp3 r13
%define tmp4 r14
%define return rax
%define return.w eax
%define stack_size 16*10 + 5*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r12
%define tmp4 r13
%define return rax
%define return.w eax
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
%endmacro
%macro FUNC_RESTORE 0
pop r13
pop r12
%endmacro
%endif
;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp4
%define dest3 mul_array
%define dest4 tmp2
%define dest5 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft5_hi xmm14
%define xgft4_lo xmm13
%define xgft4_hi xmm12
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph5 xmm8
%define xtmpl5 xmm9
%define xd1 xmm10
%define xd2 xmm11
%define xd3 xtmpl1
%define xd4 xtmph1
%define xd5 xtmpl2
align 16
global gf_5vect_mad_avx:function
func(gf_5vect_mad_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov tmp, vec
sal vec_i, 5 ;Multiply by 32
lea tmp3, [mul_array + vec_i]
sal tmp, 6 ;Multiply by 64
vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
sal vec, 5 ;Multiply by 32
add tmp, vec
vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
mov dest3, [dest1+2*PS] ; reuse mul_array
mov dest4, [dest1+3*PS]
mov dest5, [dest1+4*PS] ; reuse vec_i
mov dest2, [dest1+PS]
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xd1, xtmph1
XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector
XLDR xd4, [dest4+pos] ;Reuse xtmph1, Get next dest vector
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xd2, xtmph2
XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector
; dest3
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
vpxor xd3, xd3, xtmph3
; dest4
vpshufb xtmph2, xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl3 ;GF add high and low partials
vpxor xd4, xd4, xtmph2
; dest5
vpshufb xtmph5, xgft5_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
vpxor xtmph5, xtmph5, xtmpl5 ;GF add high and low partials
vpxor xd5, xd5, xtmph5
XSTR [dest1+pos], xd1 ;Store result into dest1
XSTR [dest2+pos], xd2 ;Store result into dest2
XSTR [dest3+pos], xd3 ;Store result into dest3
XSTR [dest4+pos], xd4 ;Store result into dest4
XSTR [dest5+pos], xd5 ;Store result into dest5
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
mov tmp, len ;Overlapped offset length-16
XLDR x0, [src+tmp] ;Get next source vector
sub len, pos
vmovdqa xtmph1, [constip16] ;Load const of i + 16
vpinsrb xtmph5, len.w, 15
vpshufb xtmph5, xmask0f ;Broadcast len to all bytes
vpcmpgtb xtmph5, xtmph5, xtmph1
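;xtmph5 is now 0xff only in the byte lanes this overlap pass still needs to
;produce; the vpand of each partial below turns the xor into a no-op for bytes
;the main loop already wrote.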
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpand xtmph1, xtmph1, xtmph5
vpxor xd1, xd1, xtmph1
XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector
XLDR xd4, [dest4+tmp] ;Reuse xtmph1, Get next dest vector
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpand xtmph2, xtmph2, xtmph5
vpxor xd2, xd2, xtmph2
XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector
; dest3
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
vpand xtmph3, xtmph3, xtmph5
vpxor xd3, xd3, xtmph3
; dest4
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
vpand xgft4_hi, xgft4_hi, xtmph5
vpxor xd4, xd4, xgft4_hi
; dest5
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
vpxor xgft5_hi, xgft5_hi, xtmpl5 ;GF add high and low partials
vpand xgft5_hi, xgft5_hi, xtmph5
vpxor xd5, xd5, xgft5_hi
XSTR [dest1+tmp], xd1 ;Store result into dest1
XSTR [dest2+tmp], xd2 ;Store result into dest2
XSTR [dest3+tmp], xd3 ;Store result into dest3
XSTR [dest4+tmp], xd4 ;Store result into dest4
XSTR [dest5+tmp], xd5 ;Store result into dest5
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_5vect_mad_avx, 02, 01, 020d

@ -0,0 +1,363 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
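;;;
;;; AVX2 variant of the five-destination multiply-and-add: processes 32 bytes per
;;; iteration and keeps each destination's low- and high-nibble lookup tables in a
;;; single ymm register, swapping 128-bit lanes with vperm2i128 before the lookups.
;;;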
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r15, 10*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r15, [rsp + 10*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp2
%define dest3 mul_array
%define dest4 vec
%define dest5 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft2_lo ymm13
%define xgft3_lo ymm12
%define xgft4_lo ymm11
%define xgft5_lo ymm10
%define x0 ymm0
%define xtmpa ymm1
%define xtmpl ymm2
%define xtmplx xmm2
%define xtmph1 ymm3
%define xtmph1x xmm3
%define xtmph2 ymm4
%define xd1 ymm5
%define xd2 ymm6
%define xd3 ymm7
%define xd4 ymm8
%define xd5 ymm9
align 16
global gf_5vect_mad_avx2:function
func(gf_5vect_mad_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec_i, 5 ;Multiply by 32
sal vec, 5 ;Multiply by 32
lea tmp, [mul_array + vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
; " Ex{00}, Ex{10}, ..., Ex{f0}
add tmp, vec
vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
; " Dx{00}, Dx{10}, ..., Dx{f0}
mov dest3, [dest1+2*PS] ; reuse mul_array
mov dest4, [dest1+3*PS] ; reuse vec
mov dest5, [dest1+4*PS] ; reuse vec_i
mov dest2, [dest1+PS]
mov dest1, [dest1]
.loop32:
XLDR x0, [src+pos] ;Get next source vector
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
XLDR xd3, [dest3+pos] ;Get next dest vector
XLDR xd4, [dest4+pos] ;Get next dest vector
XLDR xd5, [dest5+pos] ;Get next dest vector
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
vpxor xd1, xd1, xtmph1 ;xd1 += partial
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
vpxor xd2, xd2, xtmph2 ;xd2 += partial
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
; dest3
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
vpxor xd3, xd3, xtmph1 ;xd3 += partial
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
; dest4
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
vpxor xd4, xd4, xtmph2 ;xd4 += partial
; dest5
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
vpxor xd5, xd5, xtmph1 ;xd5 += partial
XSTR [dest1+pos], xd1
XSTR [dest2+pos], xd2
XSTR [dest3+pos], xd3
XSTR [dest4+pos], xd4
XSTR [dest5+pos], xd5
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
.lessthan32:
;; Tail len
;; Do one more overlap pass
mov tmp.b, 0x1f
vpinsrb xtmph1x, xtmph1x, tmp.w, 0
vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f...
mov tmp, len ;Overlapped offset length-32
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
XLDR xd3, [dest3+tmp] ;Get next dest vector
XLDR xd4, [dest4+tmp] ;Get next dest vector
XLDR xd5, [dest5+tmp] ;Get next dest vector
sub len, pos
vmovdqa xtmph2, [constip32] ;Load const of i + 32
vpinsrb xtmplx, xtmplx, len.w, 15
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
vpcmpgtb xtmpl, xtmpl, xtmph2
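;xtmpl is now 0xff only for the bytes of this final 32-byte window that the
;main loop has not yet produced; the vpand of each partial below leaves the
;already-written destination bytes unchanged.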
vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
; dest1
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials
vpand xtmph1, xtmph1, xtmpl
vpxor xd1, xd1, xtmph1 ;xd1 += partial
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
; dest2
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
vpand xtmph2, xtmph2, xtmpl
vpxor xd2, xd2, xtmph2 ;xd2 += partial
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
; dest3
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xgft3_lo ;GF add high and low partials
vpand xtmph1, xtmph1, xtmpl
vpxor xd3, xd3, xtmph1 ;xd3 += partial
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
; dest4
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xgft4_lo ;GF add high and low partials
vpand xtmph2, xtmph2, xtmpl
vpxor xd4, xd4, xtmph2 ;xd4 += partial
; dest5
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xgft5_lo ;GF add high and low partials
vpand xtmph1, xtmph1, xtmpl
vpxor xd5, xd5, xtmph1 ;xd5 += partial
XSTR [dest1+tmp], xd1
XSTR [dest2+tmp], xd2
XSTR [dest3+tmp], xd3
XSTR [dest4+tmp], xd4
XSTR [dest5+tmp], xd5
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 32
constip32:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
;;; func core, ver, snum
slversion gf_5vect_mad_avx2, 04, 01, 020e

@ -0,0 +1,373 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
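;;;
;;; For orientation (an informal sketch, not the exported API): with the
;;; expanded tables in mul_array, this routine applies source vec_i to all
;;; five outputs, i.e. roughly
;;;     dest[d][i] ^= gf_mul(coef[d], src[i])        for d = 0..4, i < len
;;; where the 32-byte table for output d sits at mul_array + 32*(d*vec + vec_i),
;;; low-nibble products in the first 16 bytes and high-nibble products in
;;; the second 16 bytes.
;;;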
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp2 r10
%define tmp3 r13
%define tmp4 r14
%define return rax
%define return.w eax
%define stack_size 16*10 + 5*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r12
%define tmp4 r13
%define return rax
%define return.w eax
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
%endmacro
%macro FUNC_RESTORE 0
pop r13
pop r12
%endmacro
%endif
;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp4
%define dest3 mul_array
%define dest4 tmp2
%define dest5 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft5_hi xmm14
%define xgft4_lo xmm13
%define xgft4_hi xmm12
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph5 xmm8
%define xtmpl5 xmm9
%define xd1 xmm10
%define xd2 xmm11
%define xd3 xtmpl1
%define xd4 xtmph1
%define xd5 xtmpl2
align 16
global gf_5vect_mad_sse:function
func(gf_5vect_mad_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov tmp, vec
sal vec_i, 5 ;Multiply by 32
lea tmp3, [mul_array + vec_i]
sal tmp, 6 ;Multiply by 64
movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
sal vec, 5 ;Multiply by 32
add tmp, vec
movdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
movdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
mov dest3, [dest1+2*PS] ; reuse mul_array
mov dest4, [dest1+3*PS]
mov dest5, [dest1+4*PS] ; reuse vec_i
mov dest2, [dest1+PS]
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
movdqa xtmph5, xgft5_hi ;Reload const array registers
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
; dest1
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pxor xd1, xtmph1
XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector
XLDR xd4, [dest4+pos] ;Reuse xtmph1. Get next dest vector
; dest2
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pxor xd2, xtmph2
XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector
; dest3
pshufb xtmph3, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xtmph3, xtmpl3 ;GF add high and low partials
pxor xd3, xtmph3
movdqa xtmph2, xgft4_hi ;Reload const array registers
movdqa xtmpl3, xgft4_lo ;Reload const array registers
; dest5
pshufb xtmph5, x0 ;Lookup mul table of high nibble
pshufb xtmpl5, xtmpa ;Lookup mul table of low nibble
pxor xtmph5, xtmpl5 ;GF add high and low partials
pxor xd5, xtmph5
; dest4
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl3 ;GF add high and low partials
pxor xd4, xtmph2
XSTR [dest1+pos], xd1 ;Store result into dest1
XSTR [dest2+pos], xd2 ;Store result into dest2
XSTR [dest3+pos], xd3 ;Store result into dest3
XSTR [dest4+pos], xd4 ;Store result into dest4
XSTR [dest5+pos], xd5 ;Store result into dest5
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
mov tmp, len ;Overlapped offset length-16
XLDR x0, [src+tmp] ;Get next source vector
sub len, pos
movdqa xtmpl1, [constip16] ;Load const of i + 16
pinsrb xtmph5, len.w, 15
pshufb xtmph5, xmask0f ;Broadcast len to all bytes
pcmpgtb xtmph5, xtmpl1
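;; xtmph5 is now a byte mask: 0xff for the trailing bytes not yet covered
;; by the last full 16-byte pass, 0x00 elsewhere, so the pand before each
;; final pxor discards lanes that are merely being re-stored.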
movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
; dest1
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pand xtmph1, xtmph5
pxor xd1, xtmph1
XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector
XLDR xd4, [dest4+tmp] ;Reuse xtmph1. Get next dest vector
; dest2
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pand xtmph2, xtmph5
pxor xd2, xtmph2
XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector
; dest3
pshufb xtmph3, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xtmph3, xtmpl3 ;GF add high and low partials
pand xtmph3, xtmph5
pxor xd3, xtmph3
; dest4
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
pand xgft4_hi, xtmph5
pxor xd4, xgft4_hi
; dest5
pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
pshufb xtmpl5, xtmpa ;Lookup mul table of low nibble
pxor xgft5_hi, xtmpl5 ;GF add high and low partials
pand xgft5_hi, xtmph5
pxor xd5, xgft5_hi
XSTR [dest1+tmp], xd1 ;Store result into dest1
XSTR [dest2+tmp], xd2 ;Store result into dest2
XSTR [dest3+tmp], xd3 ;Store result into dest3
XSTR [dest4+tmp], xd4 ;Store result into dest4
XSTR [dest5+tmp], xd5 ;Store result into dest5
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_5vect_mad_sse, 00, 01, 020c

View File

@ -0,0 +1,315 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
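;;;
;;; For orientation (an informal sketch, not the exported API): with the
;;; expanded tables in g_tbls, the routine computes six outputs at once,
;;;     dests[d][i] = XOR over s of gf_mul(coef[d][s], buffs[s][i])
;;; for d = 0..5 and s = 0..vec-1, where the 32-byte table for the pair
;;; (d, s) sits at g_tbls + 32*(d*vec + s).
;;;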
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_xmm128 xmm12, 6*16
save_xmm128 xmm13, 7*16
save_xmm128 xmm14, 8*16
save_xmm128 xmm15, 9*16
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
save_reg rsi, 10*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
vmovdqa xmm12, [rsp + 6*16]
vmovdqa xmm13, [rsp + 7*16]
vmovdqa xmm14, [rsp + 8*16]
vmovdqa xmm15, [rsp + 9*16]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
mov rsi, [rsp + 10*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define ptr arg5
%define vec_i tmp2
%define dest1 tmp3
%define dest2 tmp4
%define vskip1 tmp5
%define vskip3 tmp6
%define pos return
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm2
%define xp2 xmm3
%define xp3 xmm4
%define xp4 xmm5
%define xp5 xmm6
%define xp6 xmm7
align 16
global gf_6vect_dot_prod_avx:function
func(gf_6vect_dot_prod_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov vskip1, vec
imul vskip1, 32
mov vskip3, vec
imul vskip3, 96
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
mov dest1, [dest]
mov dest2, [dest+PS]
.loop16:
mov tmp, mul_array
xor vec_i, vec_i
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
vpxor xp4, xp4
vpxor xp5, xp5
vpxor xp6, xp6
.next_vect:
mov ptr, [src+vec_i]
add vec_i, PS
XLDR x0, [ptr+pos] ;Get next source vector
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
lea ptr, [vskip1 + vskip1*4] ;ptr = vskip5
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
vmovdqu xgft1_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
vmovdqu xgft1_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
vmovdqu xgft2_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
vmovdqu xgft2_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
vmovdqu xgft3_lo, [tmp+ptr] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
vmovdqu xgft3_hi, [tmp+ptr+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
add tmp, 32
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp4, xgft1_hi ;xp4 += partial
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp5, xgft2_hi ;xp5 += partial
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp6, xgft3_hi ;xp6 += partial
cmp vec_i, vec
jl .next_vect
mov tmp, [dest+2*PS]
mov ptr, [dest+3*PS]
mov vec_i, [dest+4*PS]
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
XSTR [tmp+pos], xp3
mov tmp, [dest+5*PS]
XSTR [ptr+pos], xp4
XSTR [vec_i+pos], xp5
XSTR [tmp+pos], xp6
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
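;; Note: the dot product never reads dest, so a short tail is handled by
;; simply rerunning one full 16-byte pass at offset len-16; the bytes that
;; overlap the previous pass are recomputed with identical values.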
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_6vect_dot_prod_avx, 02, 04, 0195

View File

@ -0,0 +1,326 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
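;;;
;;; AVX2 notes (a reading of the code below): the 0x0f mask is built in a
;;; register with vpinsrb/vpbroadcastb instead of a memory constant, each
;;; 32-byte table is loaded with its low-nibble half in the lower lane and
;;; its high-nibble half in the upper lane, and vperm2i128 produces the
;;; lane-swapped copy; the source nibbles are cross-arranged the same way
;;; so one pair of vpshufb lookups covers all 32 bytes per output.
;;;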
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
vmovdqa [rsp + 0*16], xmm6
vmovdqa [rsp + 1*16], xmm7
vmovdqa [rsp + 2*16], xmm8
vmovdqa [rsp + 3*16], xmm9
vmovdqa [rsp + 4*16], xmm10
vmovdqa [rsp + 5*16], xmm11
vmovdqa [rsp + 6*16], xmm12
vmovdqa [rsp + 7*16], xmm13
vmovdqa [rsp + 8*16], xmm14
vmovdqa [rsp + 9*16], xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
save_reg rsi, 10*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm8, [rsp + 2*16]
vmovdqa xmm9, [rsp + 3*16]
vmovdqa xmm10, [rsp + 4*16]
vmovdqa xmm11, [rsp + 5*16]
vmovdqa xmm12, [rsp + 6*16]
vmovdqa xmm13, [rsp + 7*16]
vmovdqa xmm14, [rsp + 8*16]
vmovdqa xmm15, [rsp + 9*16]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
mov rsi, [rsp + 10*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define ptr arg5
%define vec_i tmp2
%define dest1 tmp3
%define dest2 tmp4
%define vskip1 tmp5
%define vskip3 tmp6
%define pos return
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft1_hi ymm13
%define xgft2_lo ymm12
%define xgft2_hi ymm11
%define xgft3_lo ymm10
%define xgft3_hi ymm9
%define x0 ymm0
%define xtmpa ymm1
%define xp1 ymm2
%define xp2 ymm3
%define xp3 ymm4
%define xp4 ymm5
%define xp5 ymm6
%define xp6 ymm7
align 16
global gf_6vect_dot_prod_avx2:function
func(gf_6vect_dot_prod_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
mov vskip1, vec
imul vskip1, 32
mov vskip3, vec
imul vskip3, 96
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
mov dest1, [dest]
mov dest2, [dest+PS]
.loop32:
mov tmp, mul_array
xor vec_i, vec_i
vpxor xp1, xp1
vpxor xp2, xp2
vpxor xp3, xp3
vpxor xp4, xp4
vpxor xp5, xp5
vpxor xp6, xp6
.next_vect:
mov ptr, [src+vec_i]
XLDR x0, [ptr+pos] ;Get next source vector
add vec_i, PS
vpand xgft3_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xgft3_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xgft3_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
lea ptr, [vskip1 + vskip1*4] ;ptr = vskip5
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
vmovdqu xgft1_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
; " Dx{00}, Dx{10}, ..., Dx{f0}
vmovdqu xgft2_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
; " Ex{00}, Ex{10}, ..., Ex{f0}
vmovdqu xgft3_lo, [tmp+ptr] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
; " Fx{00}, Fx{10}, ..., Fx{f0}
add tmp, 32
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp4, xgft1_hi ;xp4 += partial
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp5, xgft2_hi ;xp5 += partial
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp6, xgft3_hi ;xp6 += partial
cmp vec_i, vec
jl .next_vect
mov tmp, [dest+2*PS]
mov ptr, [dest+3*PS]
mov vec_i, [dest+4*PS]
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
XSTR [tmp+pos], xp3
mov tmp, [dest+5*PS]
XSTR [ptr+pos], xp4
XSTR [vec_i+pos], xp5
XSTR [tmp+pos], xp6
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
jmp .loop32 ;Do one more overlap pass
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_6vect_dot_prod_avx2, 04, 04, 019a

View File

@ -0,0 +1,315 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
push r15
%endmacro
%macro FUNC_RESTORE 0
pop r15
pop r14
pop r13
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved, loaded and restored
%define arg5 r15 ; must be saved and restored
%define tmp r11
%define tmp2 r10
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
%define PS 8
%define LOG_PS 3
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm8, 2*16
save_xmm128 xmm9, 3*16
save_xmm128 xmm10, 4*16
save_xmm128 xmm11, 5*16
save_xmm128 xmm12, 6*16
save_xmm128 xmm13, 7*16
save_xmm128 xmm14, 8*16
save_xmm128 xmm15, 9*16
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
save_reg rsi, 10*16 + 5*8
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp + 0*16]
movdqa xmm7, [rsp + 1*16]
movdqa xmm8, [rsp + 2*16]
movdqa xmm9, [rsp + 3*16]
movdqa xmm10, [rsp + 4*16]
movdqa xmm11, [rsp + 5*16]
movdqa xmm12, [rsp + 6*16]
movdqa xmm13, [rsp + 7*16]
movdqa xmm14, [rsp + 8*16]
movdqa xmm15, [rsp + 9*16]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
mov rsi, [rsp + 10*16 + 5*8]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define ptr arg5
%define vec_i tmp2
%define dest1 tmp3
%define dest2 tmp4
%define vskip1 tmp5
%define vskip3 tmp6
%define pos return
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft1_lo xmm2
%define xgft1_hi xmm3
%define xgft2_lo xmm4
%define xgft2_hi xmm5
%define xgft3_lo xmm6
%define xgft3_hi xmm7
%define x0 xmm0
%define xtmpa xmm1
%define xp1 xmm8
%define xp2 xmm9
%define xp3 xmm10
%define xp4 xmm11
%define xp5 xmm12
%define xp6 xmm13
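;; Twelve lookup tables (six outputs x lo/hi halves) plus six accumulators
;; do not fit in the 16 xmm registers, so the tables for outputs 4-6 reuse
;; xgft1-3 and are reloaded from memory partway through each iteration.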
align 16
global gf_6vect_dot_prod_sse:function
func(gf_6vect_dot_prod_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov vskip1, vec
imul vskip1, 32
mov vskip3, vec
imul vskip3, 96
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
mov dest1, [dest]
mov dest2, [dest+PS]
.loop16:
mov tmp, mul_array
xor vec_i, vec_i
pxor xp1, xp1
pxor xp2, xp2
pxor xp3, xp3
pxor xp4, xp4
pxor xp5, xp5
pxor xp6, xp6
.next_vect:
mov ptr, [src+vec_i]
add vec_i, PS
XLDR x0, [ptr+pos] ;Get next source vector
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
movdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
movdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
lea ptr, [vskip1 + vskip1*4] ;ptr = vskip5
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp2, xgft2_hi ;xp2 += partial
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
pxor xp3, xgft3_hi ;xp3 += partial
movdqu xgft1_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
movdqu xgft1_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
movdqu xgft2_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
movdqu xgft2_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
movdqu xgft3_lo, [tmp+ptr] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
movdqu xgft3_hi, [tmp+ptr+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
add tmp, 32
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp4, xgft1_hi ;xp4 += partial
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp5, xgft2_hi ;xp5 += partial
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
pxor xp6, xgft3_hi ;xp6 += partial
cmp vec_i, vec
jl .next_vect
mov tmp, [dest+2*PS]
mov ptr, [dest+3*PS]
mov vec_i, [dest+4*PS]
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
XSTR [tmp+pos], xp3
mov tmp, [dest+5*PS]
XSTR [ptr+pos], xp4
XSTR [vec_i+pos], xp5
XSTR [tmp+pos], xp6
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_6vect_dot_prod_sse, 00, 05, 0066

View File

@ -0,0 +1,352 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
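/*
 * g_tbls layout, as initialized below: one 32-byte nibble-lookup table per
 * (output, source) pair, grouped by output, so the tables for output d
 * start at g_tbls + 32 * TEST_SOURCES * d and source j adds j * 32.
 */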
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
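/*
 * For orientation, a scalar sketch of what the function under test
 * computes (illustrative only, not compiled into the benchmark; gf[d]
 * stands for the raw coefficient arrays g1..g6 and gf_mul() for the
 * byte-wise GF(2^8) multiply declared in erasure_code.h):
 *
 *	for (d = 0; d < 6; d++)
 *		for (i = 0; i < len; i++) {
 *			u8 p = 0;
 *			for (s = 0; s < srcs; s++)
 *				p ^= gf_mul(gf[d][s], buffs[s][i]);
 *			dests[d][i] = p;
 *		}
 */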
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls;
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1;
u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6;
u8 *dest_ptrs[6], *buffs[TEST_SOURCES];
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 16, 6 * TEST_SOURCES * 32)) {
printf("alloc error: Fail");
return -1;
}
g_tbls = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest5 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest6 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref5 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref6 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
dest_ptrs[3] = dest4;
dest_ptrs[4] = dest5;
dest_ptrs[5] = dest6;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest5, 0, TEST_LEN);
memset(dest6, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(dest_ref3, 0, TEST_LEN);
memset(dest_ref4, 0, TEST_LEN);
memset(dest_ref5, 0, TEST_LEN);
memset(dest_ref6, 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
g6[i] = rand();
}
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
dest_ref5);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs,
dest_ref6);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS / 20; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
buffs, dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
buffs, dest_ref5);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
buffs, dest_ref6);
}
perf_stop(&stop);
printf("gf_6vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 6) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
}
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 6) * i);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test4\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test5\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test6\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref6, 25);
printf("dprod_dut:");
dump(dest6, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

View File

@ -0,0 +1,911 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 16
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 20000
#define TEST_TYPE_STR ""
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
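/*
 * PTR_ALIGN_CHK_B / LEN_ALIGN_CHK_B give the power-of-2 range over which
 * buffer pointers and lengths are deliberately offset later in this test;
 * 0 restricts those runs to aligned cases only.
 */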
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs;
void *buf;
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls;
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1;
u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6;
u8 *dest_ptrs[6], *buffs[TEST_SOURCES];
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptrs[6];
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
printf("alloc error: Fail");
return -1;
}
g_tbls = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest5 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest6 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref1 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref2 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref3 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref4 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref5 = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref6 = buf;
dest_ptrs[0] = dest1;
dest_ptrs[1] = dest2;
dest_ptrs[2] = dest3;
dest_ptrs[3] = dest4;
dest_ptrs[4] = dest5;
dest_ptrs[5] = dest6;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest1, 0, TEST_LEN);
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest5, 0, TEST_LEN);
memset(dest6, 0, TEST_LEN);
memset(dest_ref1, 0, TEST_LEN);
memset(dest_ref2, 0, TEST_LEN);
memset(dest_ref3, 0, TEST_LEN);
memset(dest_ref4, 0, TEST_LEN);
memset(dest_ref5, 0, TEST_LEN);
memset(dest_ref6, 0, TEST_LEN);
memset(g1, 2, TEST_SOURCES);
memset(g2, 1, TEST_SOURCES);
memset(g3, 7, TEST_SOURCES);
memset(g4, 9, TEST_SOURCES);
memset(g5, 4, TEST_SOURCES);
memset(g6, 0xe6, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
gf_vect_mul_init(g6[i], &g_tbls[160 * TEST_SOURCES + i * 32]);
}
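/* Compute each of the six reference outputs with the one-vector base
   routine, then check the six-output SIMD version against them. */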
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
dest_ref5);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs,
dest_ref6);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test6\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref6, 25);
printf("dprod_dut:");
dump(dest6, 25);
return -1;
}
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
g6[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
buffs, dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
buffs, dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
buffs, dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
buffs, dest_ref5);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
buffs, dest_ref6);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref6, 25);
printf("dprod_dut:");
dump(dest6, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
g6[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
dest_ref2);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
dest_ref3);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
dest_ref4);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
dest_ref5);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[160 * srcs], buffs,
dest_ref6);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test1 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest1, 25);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test2 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest2, 25);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test3 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest3, 25);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test4 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest4, 25);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test5 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest5, 25);
return -1;
}
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test6 srcs=%d\n", srcs);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref6, 25);
printf("dprod_dut:");
dump(dest6, 25);
return -1;
}
putchar('.');
}
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up "size" bytes from end of buffer
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < TEST_SOURCES; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
g6[i] = rand();
}
for (i = 0; i < TEST_SOURCES; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
}
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
efence_buffs, dest_ref2);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
efence_buffs, dest_ref3);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
efence_buffs, dest_ref4);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
efence_buffs, dest_ref5);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
efence_buffs, dest_ref6);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
if (0 != memcmp(dest_ref1, dest1, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, align);
printf("dprod_dut:");
dump(dest1, align);
return -1;
}
if (0 != memcmp(dest_ref2, dest2, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, align);
printf("dprod_dut:");
dump(dest2, align);
return -1;
}
if (0 != memcmp(dest_ref3, dest3, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, align);
printf("dprod_dut:");
dump(dest3, align);
return -1;
}
if (0 != memcmp(dest_ref4, dest4, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, align);
printf("dprod_dut:");
dump(dest4, align);
return -1;
}
if (0 != memcmp(dest_ref5, dest5, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, align);
printf("dprod_dut:");
dump(dest5, align);
return -1;
}
if (0 != memcmp(dest_ref6, dest6, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref6, align);
printf("dprod_dut:");
dump(dest6, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptrs[5] = dest6 + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
memset(dest2, 0, TEST_LEN);
memset(dest3, 0, TEST_LEN);
memset(dest4, 0, TEST_LEN);
memset(dest5, 0, TEST_LEN);
memset(dest6, 0, TEST_LEN);
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
g6[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], ubuffs, dest_ref6);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(udest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(udest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(udest_ptrs[2], 25);
return -1;
}
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(udest_ptrs[3], 25);
return -1;
}
if (memcmp(dest_ref5, udest_ptrs[4], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(udest_ptrs[4], 25);
return -1;
}
if (memcmp(dest_ref6, udest_ptrs[5], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref6, 25);
printf("dprod_dut:");
dump(udest_ptrs[5], 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptrs[0] - dest1;
if (memcmp(dest1, dest_ref1, offset)) {
printf("Fail rand ualign pad1 start\n");
return -1;
}
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad1 end\n");
return -1;
}
offset = udest_ptrs[1] - dest2;
if (memcmp(dest2, dest_ref1, offset)) {
printf("Fail rand ualign pad2 start\n");
return -1;
}
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad2 end\n");
return -1;
}
offset = udest_ptrs[2] - dest3;
if (memcmp(dest3, dest_ref1, offset)) {
printf("Fail rand ualign pad3 start\n");
return -1;
}
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad3 end\n");
return -1;
}
offset = udest_ptrs[3] - dest4;
if (memcmp(dest4, dest_ref1, offset)) {
printf("Fail rand ualign pad4 start\n");
return -1;
}
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad4 end\n");
return -1;
}
offset = udest_ptrs[4] - dest5;
if (memcmp(dest5, dest_ref1, offset)) {
printf("Fail rand ualign pad5 start\n");
return -1;
}
if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad5 end\n");
return -1;
}
offset = udest_ptrs[5] - dest6;
if (memcmp(dest6, dest_ref1, offset)) {
printf("Fail rand ualign pad6 start\n");
return -1;
}
if (memcmp(dest6 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad6 end\n");
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++) {
g1[i] = rand();
g2[i] = rand();
g3[i] = rand();
g4[i] = rand();
g5[i] = rand();
g6[i] = rand();
}
for (i = 0; i < srcs; i++) {
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
}
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], buffs, dest_ref6);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref1, 25);
printf("dprod_dut:");
dump(dest_ptrs[0], 25);
return -1;
}
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref2, 25);
printf("dprod_dut:");
dump(dest_ptrs[1], 25);
return -1;
}
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref3, 25);
printf("dprod_dut:");
dump(dest_ptrs[2], 25);
return -1;
}
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref4, 25);
printf("dprod_dut:");
dump(dest_ptrs[3], 25);
return -1;
}
if (memcmp(dest_ref5, dest_ptrs[4], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref5, 25);
printf("dprod_dut:");
dump(dest_ptrs[4], 25);
return -1;
}
if (memcmp(dest_ref6, dest_ptrs[5], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref6, 25);
printf("dprod_dut:");
dump(dest_ptrs[5], 25);
return -1;
}
}
printf("Pass\n");
return 0;
}

View File

@ -0,0 +1,394 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
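;;; Rough scalar sketch of the operation, inferred from the code below
;;; (orientation only, not an authoritative definition): with vec sources in
;;; the code and this call handling source index vec_i,
;;;   dest[d][i] ^= gf_mul(coef[d][vec_i], src[i])   for 0 <= i < len, d = 0..5
;;; where each coef[d][vec_i] sits in mul_array as a 32-byte low/high-nibble
;;; lookup table at byte offset (d*vec + vec_i)*32, and dest holds the six
;;; destination pointers.
;;;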
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp2 r10
%define tmp3 r13
%define tmp4 r14
%define tmp5 rdi
%define return rax
%define return.w eax
%define stack_size 16*10 + 5*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r12
%define tmp4 r13
%define tmp5 r14
%define return rax
%define return.w eax
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
%endmacro
%macro FUNC_RESTORE 0
pop r14
pop r13
pop r12
%endmacro
%endif
;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp4
%define dest3 tmp2
%define dest4 mul_array
%define dest5 tmp5
%define dest6 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft4_lo xmm14
%define xgft4_hi xmm13
%define xgft5_lo xmm12
%define xgft5_hi xmm11
%define xgft6_lo xmm10
%define xgft6_hi xmm9
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xd1 xmm8
%define xd2 xtmpl1
%define xd3 xtmph1
align 16
global gf_6vect_mad_avx:function
func(gf_6vect_mad_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov tmp, vec
sal vec_i, 5 ;Multiply by 32
lea tmp3, [mul_array + vec_i]
sal tmp, 6 ;Multiply by 64
sal vec, 5 ;Multiply by 32
lea vec_i, [tmp + vec] ;vec_i = vec*96
lea mul_array, [tmp + vec_i] ;mul_array = vec*160
vmovdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
vmovdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
vmovdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
vmovdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
vmovdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
mov dest2, [dest1+PS]
mov dest3, [dest1+2*PS]
mov dest4, [dest1+3*PS] ; reuse mul_array
mov dest5, [dest1+4*PS]
mov dest6, [dest1+5*PS] ; reuse vec_i
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
XLDR xd1, [dest1+pos] ;Get next dest vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
;dest1
vpshufb xtmph1, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xtmph1
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
;dest2
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xtmph2
;dest3
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmpl3 ;GF add high and low partials
vpxor xd3, xtmph3
XSTR [dest1+pos], xd1 ;Store result into dest1
XSTR [dest2+pos], xd2 ;Store result into dest2
XSTR [dest3+pos], xd3 ;Store result into dest3
;dest4
XLDR xd1, [dest4+pos] ;Get next dest vector
vpshufb xtmph1, xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl1, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
vpxor xd1, xd1, xtmph1
XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
;dest5
vpshufb xtmph2, xgft5_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl2, xgft5_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
vpxor xd2, xd2, xtmph2
;dest6
vpshufb xtmph3, xgft6_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl3, xgft6_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
vpxor xd3, xd3, xtmph3
XSTR [dest4+pos], xd1 ;Store result into dest4
XSTR [dest5+pos], xd2 ;Store result into dest5
XSTR [dest6+pos], xd3 ;Store result into dest6
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
;; Overlapped offset length-16
mov tmp, len ;Backup len as len=rdi
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest4+tmp] ;Get next dest vector
XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
sub len, pos
vmovdqa xtmph3, [constip16] ;Load const of i + 16
vpinsrb xtmpl3, len.w, 15
vpshufb xtmpl3, xmask0f ;Broadcast len to all bytes
vpcmpgtb xtmpl3, xtmpl3, xtmph3
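;; Mask sketch: constip16 holds signed bytes -1..-16; at this point len equals
;; (len argument - 16 - pos), so after the broadcast and vpcmpgtb a byte lane
;; is all-ones only for the last (len argument - pos) bytes of the overlapped
;; block. The vpand steps below zero the GF partials for bytes the main loop
;; already produced, so the xor/store leaves those bytes unchanged.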
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
;dest4
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
vpand xgft4_hi, xgft4_hi, xtmpl3
vpxor xd1, xd1, xgft4_hi
;dest5
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
vpand xgft5_hi, xgft5_hi, xtmpl3
vpxor xd2, xd2, xgft5_hi
;dest6
vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
vpand xgft6_hi, xgft6_hi, xtmpl3
vpxor xd3, xd3, xgft6_hi
XSTR [dest4+tmp], xd1 ;Store result into dest4
XSTR [dest5+tmp], xd2 ;Store result into dest5
XSTR [dest6+tmp], xd3 ;Store result into dest6
vmovdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
vmovdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
vmovdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
vmovdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
vmovdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
vmovdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
;dest1
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
vpand xgft4_hi, xgft4_hi, xtmpl3
vpxor xd1, xd1, xgft4_hi
;dest2
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
vpand xgft5_hi, xgft5_hi, xtmpl3
vpxor xd2, xd2, xgft5_hi
;dest3
vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
vpand xgft6_hi, xgft6_hi, xtmpl3
vpxor xd3, xd3, xgft6_hi
XSTR [dest1+tmp], xd1 ;Store result into dest1
XSTR [dest2+tmp], xd2 ;Store result into dest2
XSTR [dest3+tmp], xd3 ;Store result into dest3
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_6vect_mad_avx, 02, 01, 0210

View File

@ -0,0 +1,400 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
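;;; Same operation as the SSE/AVX versions, but 32 bytes per iteration; each
;;; coefficient's low- and high-nibble lookup tables are kept packed in a
;;; single ymm register (low table in the lower 128-bit lane, high table in
;;; the upper lane), with vperm2i128 used to line the lanes up for vpshufb.
;;;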
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r13
%define return rax
%define return.w eax
%define stack_size 16*10 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r15, 10*16 + 2*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r15, [rsp + 10*16 + 2*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r12
%define return rax
%define return.w eax
%define func(x) x:
%macro FUNC_SAVE 0
push r12
%endmacro
%macro FUNC_RESTORE 0
pop r12
%endmacro
%endif
;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 tmp3
%define dest3 tmp2
%define dest4 mul_array
%define dest5 vec
%define dest6 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft2_lo ymm13
%define xgft3_lo ymm12
%define xgft4_lo ymm11
%define xgft5_lo ymm10
%define xgft6_lo ymm9
%define x0 ymm0
%define xtmpa ymm1
%define xtmpl ymm2
%define xtmplx xmm2
%define xtmph ymm3
%define xtmphx xmm3
%define xd1 ymm4
%define xd2 ymm5
%define xd3 ymm6
%define xd4 ymm7
%define xd5 ymm8
%define xd6 xd1
align 16
global gf_6vect_mad_avx2:function
func(gf_6vect_mad_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec_i, 5 ;Multiply by 32
sal vec, 5 ;Multiply by 32
lea tmp, [mul_array + vec_i]
mov vec_i, vec
mov mul_array, vec
sal vec_i, 1
sal mul_array, 1
add vec_i, vec ;vec_i=vec*96
add mul_array, vec_i ;mul_array=vec*160
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vmovdqu xgft4_lo, [tmp+vec_i] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
; " Dx{00}, Dx{10}, ..., Dx{f0}
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
; " Ex{00}, Ex{10}, ..., Ex{f0}
vmovdqu xgft6_lo, [tmp+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
; " Fx{00}, Fx{10}, ..., Fx{f0}
mov dest2, [dest1+PS] ; reuse tmp3
mov dest3, [dest1+2*PS] ; reuse tmp2
mov dest4, [dest1+3*PS] ; reuse mul_array
mov dest5, [dest1+4*PS] ; reuse vec
mov dest6, [dest1+5*PS] ; reuse vec_i
mov dest1, [dest1]
.loop32:
XLDR x0, [src+pos] ;Get next source vector
XLDR xd1, [dest1+pos] ;Get next dest vector
XLDR xd2, [dest2+pos] ;Get next dest vector
XLDR xd3, [dest3+pos] ;Get next dest vector
XLDR xd4, [dest4+pos] ;Get next dest vector
XLDR xd5, [dest5+pos] ;Get next dest vector
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
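;; Lane note: vpshufb on ymm works within each 128-bit lane, and every
;; xgftN_lo register holds the low-nibble table in its lower lane and the
;; high-nibble table in its upper lane. The two vperm2i128 above pair the
;; low nibbles of bytes 0-15 with the high nibbles of bytes 16-31 (and vice
;; versa), so one shuffle against the packed table plus one against its
;; lane-swapped copy covers all 32 bytes.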
;dest1
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd1, xd1, xtmph ;xd1 += partial
XSTR [dest1+pos], xd1 ;Store result into dest1
;dest2
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd2, xd2, xtmph ;xd2 += partial
;dest3
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd3, xd3, xtmph ;xd3 += partial
XLDR xd6, [dest6+pos] ;reuse xd1. Get next dest vector
;dest4
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd4, xd4, xtmph ;xd4 += partial
;dest5
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd5, xd5, xtmph ;xd5 += partial
;dest6
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft6_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd6, xd6, xtmph ;xd6 += partial
XSTR [dest2+pos], xd2 ;Store result into dest2
XSTR [dest3+pos], xd3 ;Store result into dest3
XSTR [dest4+pos], xd4 ;Store result into dest4
XSTR [dest5+pos], xd5 ;Store result into dest5
XSTR [dest6+pos], xd6 ;Store result into dest6
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
.lessthan32:
;; Tail len
;; Do one more overlap pass
mov tmp.b, 0x1f
vpinsrb xtmphx, xtmphx, tmp.w, 0
vpbroadcastb xtmph, xtmphx ;Construct mask 0x1f1f1f...
mov tmp, len ;Overlapped offset length-32
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;Get next dest vector
XLDR xd3, [dest3+tmp] ;Get next dest vector
XLDR xd4, [dest4+tmp] ;Get next dest vector
XLDR xd5, [dest5+tmp] ;Get next dest vector
sub len, pos
vpinsrb xtmplx, xtmplx, len.w, 15
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
vpshufb xtmpl, xtmpl, xtmph ;Broadcast len to all bytes. xtmph=0x1f1f1f...
vpcmpgtb xtmpl, xtmpl, [constip32]
vpand xtmph, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vperm2i128 xtmpa, xtmph, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
vperm2i128 x0, xtmph, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
;dest1
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xgft1_lo ;GF add high and low partials
vpand xtmph, xtmph, xtmpl
vpxor xd1, xd1, xtmph ;xd1 += partial
XSTR [dest1+tmp], xd1 ;Store result into dest1
;dest2
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xgft2_lo ;GF add high and low partials
vpand xtmph, xtmph, xtmpl
vpxor xd2, xd2, xtmph ;xd2 += partial
;dest3
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xgft3_lo ;GF add high and low partials
vpand xtmph, xtmph, xtmpl
vpxor xd3, xd3, xtmph ;xd3 += partial
XLDR xd6, [dest6+tmp] ;reuse xd1. Get next dest vector
;dest4
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xgft4_lo ;GF add high and low partials
vpand xtmph, xtmph, xtmpl
vpxor xd4, xd4, xtmph ;xd4 += partial
;dest5
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xgft5_lo ;GF add high and low partials
vpand xtmph, xtmph, xtmpl
vpxor xd5, xd5, xtmph ;xd5 += partial
;dest6
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xgft6_lo ;GF add high and low partials
vpand xtmph, xtmph, xtmpl
vpxor xd6, xd6, xtmph ;xd6 += partial
XSTR [dest2+tmp], xd2 ;Store result into dest2
XSTR [dest3+tmp], xd3 ;Store result into dest3
XSTR [dest4+tmp], xd4 ;Store result into dest4
XSTR [dest5+tmp], xd5 ;Store result into dest5
XSTR [dest6+tmp], xd6 ;Store result into dest6
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 32
constip32:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
;;; func core, ver, snum
slversion gf_6vect_mad_avx2, 04, 01, 0211

View File

@ -0,0 +1,406 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
%include "reg_sizes.asm"
%define PS 8
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define tmp.w r11d
%define tmp2 r10
%define tmp3 r13
%define tmp4 r14
%define tmp5 rdi
%define return rax
%define return.w eax
%define stack_size 16*10 + 5*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
movdqa [rsp+16*3],xmm9
movdqa [rsp+16*4],xmm10
movdqa [rsp+16*5],xmm11
movdqa [rsp+16*6],xmm12
movdqa [rsp+16*7],xmm13
movdqa [rsp+16*8],xmm14
movdqa [rsp+16*9],xmm15
save_reg r12, 10*16 + 0*8
save_reg r13, 10*16 + 1*8
save_reg r14, 10*16 + 2*8
save_reg r15, 10*16 + 3*8
save_reg rdi, 10*16 + 4*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
movdqa xmm9, [rsp+16*3]
movdqa xmm10, [rsp+16*4]
movdqa xmm11, [rsp+16*5]
movdqa xmm12, [rsp+16*6]
movdqa xmm13, [rsp+16*7]
movdqa xmm14, [rsp+16*8]
movdqa xmm15, [rsp+16*9]
mov r12, [rsp + 10*16 + 0*8]
mov r13, [rsp + 10*16 + 1*8]
mov r14, [rsp + 10*16 + 2*8]
mov r15, [rsp + 10*16 + 3*8]
mov rdi, [rsp + 10*16 + 4*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp2 r10
%define tmp3 r12
%define tmp4 r13
%define tmp5 r14
%define return rax
%define return.w eax
%define func(x) x:
%macro FUNC_SAVE 0
push r12
push r13
push r14
%endmacro
%macro FUNC_RESTORE 0
pop r14
pop r13
pop r12
%endmacro
%endif
;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest1 arg5
%define pos return
%define pos.w return.w
%define dest2 mul_array
%define dest3 tmp2
%define dest4 tmp4
%define dest5 tmp5
%define dest6 vec_i
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft4_lo xmm14
%define xgft4_hi xmm13
%define xgft5_lo xmm12
%define xgft5_hi xmm11
%define xgft6_lo xmm10
%define xgft6_hi xmm9
%define x0 xmm0
%define xtmpa xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xd1 xmm8
%define xd2 xtmpl1
%define xd3 xtmph1
align 16
global gf_6vect_mad_sse:function
func(gf_6vect_mad_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov tmp, vec
sal vec_i, 5 ;Multiply by 32
lea tmp3, [mul_array + vec_i]
sal tmp, 6 ;Multiply by 64
sal vec, 5 ;Multiply by 32
lea vec_i, [tmp + vec] ;vec_i = vec*96
lea mul_array, [tmp + vec_i] ;mul_array = vec*160
movdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
movdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
movdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
movdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
movdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
mov dest2, [dest1+PS]
mov dest3, [dest1+2*PS]
mov dest4, [dest1+3*PS] ; reuse mul_array
mov dest5, [dest1+4*PS]
mov dest6, [dest1+5*PS] ; reuse vec_i
mov dest1, [dest1]
.loop16:
XLDR x0, [src+pos] ;Get next source vector
movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
XLDR xd1, [dest1+pos] ;Get next dest vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
;dest1
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pxor xd1, xtmph1
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest3 vector
;dest2
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pxor xd2, xtmph2
;dest3
pshufb xtmph3, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xtmph3, xtmpl3 ;GF add high and low partials
pxor xd3, xtmph3
XSTR [dest1+pos], xd1 ;Store result into dest1
XSTR [dest2+pos], xd2 ;Store result into dest2
XSTR [dest3+pos], xd3 ;Store result into dest3
movdqa xtmph1, xgft4_hi ;Reload const array registers
movdqa xtmpl1, xgft4_lo ;Reload const array registers
movdqa xtmph2, xgft5_hi ;Reload const array registers
movdqa xtmpl2, xgft5_lo ;Reload const array registers
movdqa xtmph3, xgft6_hi ;Reload const array registers
movdqa xtmpl3, xgft6_lo ;Reload const array registers
;dest4
XLDR xd1, [dest4+pos] ;Get next dest vector
pshufb xtmph1, x0 ;Lookup mul table of high nibble
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
pxor xtmph1, xtmpl1 ;GF add high and low partials
pxor xd1, xtmph1
XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
;dest5
pshufb xtmph2, x0 ;Lookup mul table of high nibble
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
pxor xtmph2, xtmpl2 ;GF add high and low partials
pxor xd2, xtmph2
;dest6
pshufb xtmph3, x0 ;Lookup mul table of high nibble
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
pxor xtmph3, xtmpl3 ;GF add high and low partials
pxor xd3, xtmph3
XSTR [dest4+pos], xd1 ;Store result into dest4
XSTR [dest5+pos], xd2 ;Store result into dest5
XSTR [dest6+pos], xd3 ;Store result into dest6
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
.lessthan16:
;; Tail len
;; Do one more overlap pass
;; Overlapped offset length-16
mov tmp, len ;Backup len as len=rdi
XLDR x0, [src+tmp] ;Get next source vector
XLDR xd1, [dest4+tmp] ;Get next dest vector
XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
sub len, pos
movdqa xtmph3, [constip16] ;Load const of i + 16
pinsrb xtmpl3, len.w, 15
pshufb xtmpl3, xmask0f ;Broadcast len to all bytes
pcmpgtb xtmpl3, xtmph3
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
;dest4
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
pand xgft4_hi, xtmpl3
pxor xd1, xgft4_hi
;dest5
pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft5_hi, xgft5_lo ;GF add high and low partials
pand xgft5_hi, xtmpl3
pxor xd2, xgft5_hi
;dest6
pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft6_hi, xgft6_lo ;GF add high and low partials
pand xgft6_hi, xtmpl3
pxor xd3, xgft6_hi
XSTR [dest4+tmp], xd1 ;Store result into dest4
XSTR [dest5+tmp], xd2 ;Store result into dest5
XSTR [dest6+tmp], xd3 ;Store result into dest6
movdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
movdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
movdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
movdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
movdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
XLDR xd1, [dest1+tmp] ;Get next dest vector
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
;dest1
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
pand xgft4_hi, xtmpl3
pxor xd1, xgft4_hi
;dest2
pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft5_hi, xgft5_lo ;GF add high and low partials
pand xgft5_hi, xtmpl3
pxor xd2, xgft5_hi
;dest3
pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft6_hi, xgft6_lo ;GF add high and low partials
pand xgft6_hi, xtmpl3
pxor xd3, xgft6_hi
XSTR [dest1+tmp], xd1 ;Store result into dest1
XSTR [dest2+tmp], xd2 ;Store result into dest2
XSTR [dest3+tmp], xd3 ;Store result into dest3
.return_pass:
FUNC_RESTORE
mov return, 0
ret
.return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
;;; func core, ver, snum
slversion gf_6vect_mad_sse, 00, 01, 020f

View File

@ -0,0 +1,225 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include <assert.h>
#include "erasure_code.h"
#define TEST_LEN 8192
#ifndef TEST_SOURCES
# define TEST_SOURCES 128
#endif
#ifndef RANDOMS
# define RANDOMS 200
#endif
#define KMAX TEST_SOURCES
typedef unsigned char u8;
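// Multiply two n x n matrices over GF(2^8): gf_mul() and xor stand in for
// scalar multiply and add.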
void matrix_mult(u8 * a, u8 * b, u8 * c, int n)
{
int i, j, k;
u8 d;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
d = 0;
for (k = 0; k < n; k++) {
d ^= gf_mul(a[n * i + k], b[n * k + j]);
}
c[i * n + j] = d;
}
}
}
void print_matrix(u8 * a, int n)
{
int i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
printf(" %2x", a[i * n + j]);
}
printf("\n");
}
printf("\n");
}
int is_ident(u8 * a, const int n)
{
int i, j;
u8 c;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c = *a++;
if (i == j)
c--;
if (c != 0)
return -1;
}
}
return 0;
}
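// Invert 'in' into 'inv' (saving the original in 'sav'), multiply the inverse
// by the saved copy, and require the product to be the identity.
// Returns 0 on success, -1 if the matrix is reported singular or the product
// is not the identity.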
int inv_test(u8 * in, u8 * inv, u8 * sav, int n)
{
memcpy(sav, in, n * n);
if (gf_invert_matrix(in, inv, n)) {
printf("Given singular matrix\n");
print_matrix(sav, n);
return -1;
}
matrix_mult(inv, sav, in, n);
if (is_ident(in, n)) {
printf("fail\n");
print_matrix(sav, n);
print_matrix(inv, n);
print_matrix(in, n);
return -1;
}
putchar('.');
return 0;
}
int main(int argc, char *argv[])
{
int i, k, t;
u8 *test_mat, *save_mat, *invr_mat;
u8 test1[] = { 1, 1, 6,
1, 1, 1,
7, 1, 9
};
u8 test2[] = { 0, 1, 6,
1, 0, 1,
0, 1, 9
};
u8 test3[] = { 0, 0, 1,
1, 0, 0,
0, 1, 1
};
u8 test4[] = { 0, 1, 6, 7,
1, 1, 0, 0,
0, 1, 2, 3,
3, 2, 2, 3
}; // = row3+3*row2
printf("gf_inverse_test: max=%d ", KMAX);
test_mat = malloc(KMAX * KMAX);
save_mat = malloc(KMAX * KMAX);
invr_mat = malloc(KMAX * KMAX);
if (NULL == test_mat || NULL == save_mat || NULL == invr_mat)
return -1;
// Test with lots of leading 1's
k = 3;
memcpy(test_mat, test1, k * k);
if (inv_test(test_mat, invr_mat, save_mat, k))
return -1;
// Test with leading zeros
k = 3;
memcpy(test_mat, test2, k * k);
if (inv_test(test_mat, invr_mat, save_mat, k))
return -1;
// Test 3
k = 3;
memcpy(test_mat, test3, k * k);
if (inv_test(test_mat, invr_mat, save_mat, k))
return -1;
// Test 4 - try a singular matrix
k = 4;
memcpy(test_mat, test4, k * k);
if (!gf_invert_matrix(test_mat, invr_mat, k)) {
printf("Fail: didn't catch singular matrix\n");
print_matrix(test4, 4);
return -1;
}
// Do random test of size KMAX
k = KMAX;
for (i = 0; i < k * k; i++)
test_mat[i] = save_mat[i] = rand();
if (gf_invert_matrix(test_mat, invr_mat, k)) {
printf("rand picked a singular matrix, try again\n");
return -1;
}
matrix_mult(invr_mat, save_mat, test_mat, k);
if (is_ident(test_mat, k)) {
printf("fail\n");
print_matrix(save_mat, k);
print_matrix(invr_mat, k);
print_matrix(test_mat, k);
return -1;
}
// Do Randoms. Random size and coefficients
for (t = 0; t < RANDOMS; t++) {
k = rand() % KMAX;
for (i = 0; i < k * k; i++)
test_mat[i] = save_mat[i] = rand();
if (gf_invert_matrix(test_mat, invr_mat, k))
continue;
matrix_mult(invr_mat, save_mat, test_mat, k);
if (is_ident(test_mat, k)) {
printf("fail rand k=%d\n", k);
print_matrix(save_mat, k);
print_matrix(invr_mat, k);
print_matrix(test_mat, k);
return -1;
}
if (0 == (t % 8))
putchar('.');
}
printf(" Pass\n");
return 0;
}

View File

@ -0,0 +1,166 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "test.h"
#include "erasure_code.h"
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 4000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN GT_L3_CACHE / TEST_SOURCES
# define TEST_LOOPS 10
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
// Global GF(256) tables
u8 gff[256];
u8 gflog[256];
u8 gf_mul_table[256 * 256];
void mk_gf_field(void)
{
int i;
u8 s = 1;
gflog[0] = 0;
for (i = 0; i < 256; i++) {
gff[i] = s;
gflog[s] = i;
s = (s << 1) ^ ((s & 0x80) ? 0x1d : 0); // multiply by 2 (i.e., by x) in GF(2^8)
}
}
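// Note: gff/gflog form the usual exp/log tables for GF(2^8) with the 0x1d
// reduction polynomial; a log-based multiply would be
// gff[(gflog[a] + gflog[b]) % 255] for nonzero a and b. The timed loops
// below use gf_mul() and the flat 64 KB product table instead.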
void mk_gf_mul_table(u8 * table)
{
// Populate a single table with all multiply combinations for a fast,
// single-table lookup of GF(2^8) multiply at the expense of memory.
int i, j;
for (i = 0; i < 256; i++)
for (j = 0; j < 256; j++)
table[i * 256 + j] = gf_mul(i, j);
}
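// Scalar reference: dest[i] is the GF(2^8) dot product
// XOR over j of gf_mul(src[j][i], v[j]).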
void gf_vect_dot_prod_ref(int len, int vlen, u8 * v, u8 ** src, u8 * dest)
{
int i, j;
u8 s;
for (i = 0; i < len; i++) {
s = 0;
for (j = 0; j < vlen; j++)
s ^= gf_mul(src[j][i], v[j]);
dest[i] = s;
}
}
int main(void)
{
int i, j, k;
u8 s, vec[TEST_SOURCES], dest1[TEST_LEN], dest2[TEST_LEN];
u8 *matrix[TEST_SOURCES];
struct perf start, stop;
mk_gf_field();
mk_gf_mul_table(gf_mul_table);
//generate random vector and matrix/data
for (i = 0; i < TEST_SOURCES; i++) {
vec[i] = rand();
if (!(matrix[i] = malloc(TEST_LEN))) {
fprintf(stderr, "Error: buffer allocation failed\n\n");
return -1;
}
for (j = 0; j < TEST_LEN; j++)
matrix[i][j] = rand();
}
gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++)
gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);
perf_stop(&stop);
printf("gf_vect_dot_prod_2tbl" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
// Warm up mult tables
for (i = 0; i < TEST_LEN; i++) {
s = 0;
for (j = 0; j < TEST_SOURCES; j++) {
s ^= gf_mul_table[vec[j] * 256 + matrix[j][i]];
}
dest2[i] = s;
}
perf_start(&start);
for (k = 0; k < TEST_LOOPS; k++) {
for (i = 0; i < TEST_LEN; i++) {
s = 0;
for (j = 0; j < TEST_SOURCES; j++) {
s ^= gf_mul_table[vec[j] * 256 + matrix[j][i]];
}
dest2[i] = s;
}
}
perf_stop(&stop);
printf("gf_vect_dot_prod_1tbl" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * k);
// Compare with reference function
if (0 != memcmp(dest1, dest2, TEST_LEN)) {
printf("Error, different results!\n\n");
return -1;
}
printf("Pass functional test\n");
return 0;
}

View File

@ -0,0 +1,271 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest);
;;;
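;;;
;;; len    - number of bytes per vector (>= 16; a non-multiple of 16 is
;;;          handled with a final overlapping pass)
;;; vec    - number of source vectors
;;; g_tbls - 32 bytes of lookup tables per source vector, as built by
;;;          gf_vect_mul_init(): 16 low-nibble and 16 high-nibble products
;;;          of that vector's coefficient
;;; buffs  - array of pointers to the source vectors
;;; dest   - destination buffer
;;;
;;; Each constant multiply is done by splitting source bytes into low and
;;; high nibbles and using vpshufb as two 16-entry table lookups whose
;;; results are XORed together.
;;;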
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define tmp r11
%define tmp2 r10
%define tmp3 r9
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved and loaded
%define tmp r11
%define tmp2 r10
%define tmp3 rdi ; must be saved and loaded
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define frame_size 2*8
%define arg(x) [rsp + frame_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
rex_push_reg r12
push_reg rdi
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
pop rdi
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define trans ecx ;trans is for the variables in stack
%define arg0 trans
%define arg0_m arg(0)
%define arg1 trans
%define arg1_m arg(1)
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 ebx
%define arg4 trans
%define arg4_m arg(4)
%define tmp edx
%define tmp2 edi
%define tmp3 esi
%define return eax
%macro SLDR 2 ;; stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
push esi
push edi
push ebx
mov arg3, arg(3)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
mov esp, ebp
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define vec_i tmp2
%define ptr tmp3
%define pos return
%ifidn PS,4 ;32-bit code
%define vec_m arg1_m
%define len_m arg0_m
%define dest_m arg4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ; 64-bit code
default rel
[bits 64]
%endif
section .text
%define xmask0f xmm5
%define xgft_lo xmm4
%define xgft_hi xmm3
%define x0 xmm0
%define xtmpa xmm1
%define xp xmm2
align 16
global gf_vect_dot_prod_avx:function
func(gf_vect_dot_prod_avx)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
.loop16:
vpxor xp, xp
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
mov ptr, [src+vec_i*PS]
vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft_hi, [tmp+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, 1
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft_hi, xgft_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
vpxor xp, xp, xgft_hi ;xp += partial
SLDR vec, vec_m
cmp vec_i, vec
jl .next_vect
SLDR dest, dest_m
XSTR [dest+pos], xp
add pos, 16 ;Loop on 16 bytes at a time
SLDR len, len_m
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_vect_dot_prod_avx, 02, 05, 0061

View File

@@ -0,0 +1,280 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest);
;;;
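;;;
;;; Same interface and nibble-lookup scheme as the AVX version, but operating
;;; on 32 bytes (>= 32 required) per iteration. The 32-byte table for each
;;; vector is loaded once and vperm2i128 copies its low and high 16-byte
;;; halves into both 128-bit lanes, since vpshufb looks up within each lane
;;; independently.
;;;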
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 r9
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved and loaded
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define tmp2 r10
%define tmp3 rdi ; must be saved and loaded
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define frame_size 2*8
%define arg(x) [rsp + frame_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
rex_push_reg r12
push_reg rdi
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
pop rdi
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define trans ecx ;trans is for the variables in stack
%define arg0 trans
%define arg0_m arg(0)
%define arg1 trans
%define arg1_m arg(1)
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 ebx
%define arg4 trans
%define arg4_m arg(4)
%define tmp edx
%define tmp.w edx
%define tmp.b dl
%define tmp2 edi
%define tmp3 esi
%define return eax
%macro SLDR 2 ;stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
push esi
push edi
push ebx
mov arg3, arg(3)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
mov esp, ebp
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define vec_i tmp2
%define ptr tmp3
%define pos return
%ifidn PS,4 ;32-bit code
%define vec_m arg1_m
%define len_m arg0_m
%define dest_m arg4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
%ifidn PS,8 ;64-bit code
default rel
[bits 64]
%endif
section .text
%define xmask0f ymm3
%define xmask0fx xmm3
%define xgft_lo ymm4
%define xgft_hi ymm5
%define x0 ymm0
%define xtmpa ymm1
%define xp ymm2
align 16
global gf_vect_dot_prod_avx2:function
func(gf_vect_dot_prod_avx2)
FUNC_SAVE
SLDR len, len_m
sub len, 32
SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
.loop32:
vpxor xp, xp
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
mov ptr, [src+vec_i*PS]
vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, 1
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xgft_hi, xgft_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
vpxor xp, xp, xgft_hi ;xp += partial
SLDR vec, vec_m
cmp vec_i, vec
jl .next_vect
SLDR dest, dest_m
XSTR [dest+pos], xp
add pos, 32 ;Loop on 32 bytes at a time
SLDR len, len_m
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
jmp .loop32 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_vect_dot_prod_avx2, 04, 05, 0190

View File

@@ -0,0 +1,184 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_avx
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
u8 *temp_buff, *buffs[TEST_SOURCES];
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buff = buf;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest, 0, TEST_LEN);
memset(temp_buff, 0, TEST_LEN);
memset(dest_ref, 0, TEST_LEN);
memset(g, 0, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
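// Expand each random coefficient into the 32-byte nibble lookup table
// format expected by the SIMD dot-product kernels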
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
}
perf_stop(&stop);
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

View File

@@ -0,0 +1,525 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_avx
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 16
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs, m, k, nerrs, r, err;
void *buf;
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptr;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buff = buf;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest, 0, TEST_LEN);
memset(temp_buff, 0, TEST_LEN);
memset(dest_ref, 0, TEST_LEN);
memset(g, 0, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
} else
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
dump_matrix(buffs, 5, srcs);
printf("dprod_base:");
dump(dest_ref, 5);
printf("dprod:");
dump(dest, 5);
return -1;
}
putchar('.');
}
}
// Test erasure code using gf_vect_dot_prod
// Pick a first test
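// Flow: rows k..m-1 of the (m x k) encode matrix generate m - k parity
// buffers from the k data buffers; after erasing some data buffers, the
// k x k submatrix of rows for the surviving buffers is inverted and its
// rows provide the dot-product coefficients that rebuild the lost data.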
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
gf_gen_rs_matrix(a, m, k);
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
}
// Random buffers in erasure
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
gf_gen_rs_matrix(a, m, k);
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= k) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (i = 0; i < nerrs; i++)
printf(" %d", src_err_list[i]);
printf("\na:\n");
dump_u8xu8((u8 *) a, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) d, k, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
return -1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);
if (0 != memcmp(dest_ref, dest, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, align);
printf("dprod:");
dump(dest, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest, 0, TEST_LEN); // zero pad to check write-over
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);
if (memcmp(dest_ref, udest_ptr, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(udest_ptr, 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptr - dest;
if (memcmp(dest, dest_ref, offset)) {
printf("Fail rand ualign pad start\n");
return -1;
}
if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad end\n");
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);
if (memcmp(dest_ref, dest, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
}
printf("done all: Pass\n");
return 0;
}

View File

@@ -0,0 +1,290 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 250
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, m, k, nerrs, r, err;
void *buf;
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
printf("gf_vect_dot_prod_base: %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buff = buf;
// Init
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest, 0, TEST_LEN);
memset(temp_buff, 0, TEST_LEN);
memset(dest_ref, 0, TEST_LEN);
memset(g, 0, TEST_SOURCES);
// Test erasure code using gf_vect_dot_prod
// Pick a first test
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
gf_gen_cauchy1_matrix(a, m, k);
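// Note: the recovery logic below assumes the top k rows of the encode matrix
// act as identity (data passthrough), so only rows k..m-1 carry parity
// coefficients; gf_gen_cauchy1_matrix produces this layout.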
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, buffs, buffs[i]);
}
// Random buffers in erasure
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, recov, temp_buff);
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
gf_gen_cauchy1_matrix(a, m, k);
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, buffs, buffs[i]);
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= k) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, recov, temp_buff);
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (i = 0; i < nerrs; i++)
printf(" %d", src_err_list[i]);
printf("\na:\n");
dump_u8xu8((u8 *) a, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) d, k, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
return -1;
}
}
putchar('.');
}
printf("done all: Pass\n");
return 0;
}

View File

@@ -0,0 +1,184 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_vect_dot_prod
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
u8 *temp_buff, *buffs[TEST_SOURCES];
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buff = buf;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest, 0, TEST_LEN);
memset(temp_buff, 0, TEST_LEN);
memset(dest_ref, 0, TEST_LEN);
memset(g, 0, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
}
perf_stop(&stop);
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

View File

@@ -0,0 +1,271 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_dot_prod_sse(len, vec, *g_tbls, **buffs, *dest);
;;;
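;;;
;;; Same interface and nibble-lookup scheme as the AVX version. The SSE forms
;;; of pand/pshufb are destructive (two-operand), so an unshifted copy of the
;;; source is kept in xtmpa before extracting the high nibble.
;;;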
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define tmp r11
%define tmp2 r10
%define tmp3 r9
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved and loaded
%define tmp r11
%define tmp2 r10
%define tmp3 rdi ; must be saved and loaded
%define return rax
%macro SLDR 2
%endmacro
%define SSTR SLDR
%define PS 8
%define frame_size 2*8
%define arg(x) [rsp + frame_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
rex_push_reg r12
push_reg rdi
end_prolog
mov arg4, arg(4)
%endmacro
%macro FUNC_RESTORE 0
pop rdi
pop r12
%endmacro
%endif
%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;; arg4
;;; arg3
;;; arg2
;;; arg1
;;; arg0
;;; return
;;;<================= esp of caller
;;; ebp
;;;<================= ebp = esp
;;; esi
;;; edi
;;; ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
%define PS 4
%define LOG_PS 2
%define func(x) x:
%define arg(x) [ebp + PS*2 + PS*x]
%define trans ecx ;trans is for the variables in stack
%define arg0 trans
%define arg0_m arg(0)
%define arg1 trans
%define arg1_m arg(1)
%define arg2 arg2_m
%define arg2_m arg(2)
%define arg3 ebx
%define arg4 trans
%define arg4_m arg(4)
%define tmp edx
%define tmp2 edi
%define tmp3 esi
%define return eax
%macro SLDR 2 ;; stack load/restore
mov %1, %2
%endmacro
%define SSTR SLDR
%macro FUNC_SAVE 0
push ebp
mov ebp, esp
push esi
push edi
push ebx
mov arg3, arg(3)
%endmacro
%macro FUNC_RESTORE 0
pop ebx
pop edi
pop esi
mov esp, ebp
pop ebp
%endmacro
%endif ; output formats
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest arg4
%define vec_i tmp2
%define ptr tmp3
%define pos return
%ifidn PS,4 ;32-bit code
%define vec_m arg1_m
%define len_m arg0_m
%define dest_m arg4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
%ifidn PS,8 ;64-bit code
default rel
[bits 64]
%endif
section .text
%define xmask0f xmm5
%define xgft_lo xmm4
%define xgft_hi xmm3
%define x0 xmm0
%define xtmpa xmm1
%define xp xmm2
align 16
global gf_vect_dot_prod_sse:function
func(gf_vect_dot_prod_sse)
FUNC_SAVE
SLDR len, len_m
sub len, 16
SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
.loop16:
pxor xp, xp
mov tmp, mul_array
xor vec_i, vec_i
.next_vect:
mov ptr, [src+vec_i*PS]
movdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft_hi, [tmp+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, 1
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
pshufb xgft_hi, x0 ;Lookup mul table of high nibble
pshufb xgft_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft_hi, xgft_lo ;GF add high and low partials
pxor xp, xgft_hi ;xp += partial
SLDR vec, vec_m
cmp vec_i, vec
jl .next_vect
SLDR dest, dest_m
XSTR [dest+pos], xp
add pos, 16 ;Loop on 16 bytes at a time
SLDR len, len_m
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
jmp .loop16 ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_vect_dot_prod_sse, 00, 05, 0060

View File

@@ -0,0 +1,184 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "test.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_sse
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j;
void *buf;
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
u8 *temp_buff, *buffs[TEST_SOURCES];
struct perf start, stop;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buff = buf;
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
memset(dest, 0, TEST_LEN);
memset(temp_buff, 0, TEST_LEN);
memset(dest_ref, 0, TEST_LEN);
memset(g, 0, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
#ifdef DO_REF_PERF
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
}
perf_stop(&stop);
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
#endif
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
for (j = 0; j < TEST_SOURCES; j++)
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
printf("pass perf check\n");
return 0;
}

View File

@@ -0,0 +1,525 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_sse
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 16
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs, m, k, nerrs, r, err;
void *buf;
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptr;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buff = buf;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest, 0, TEST_LEN);
memset(temp_buff, 0, TEST_LEN);
memset(dest_ref, 0, TEST_LEN);
memset(g, 0, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
} else
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
dump_matrix(buffs, 5, srcs);
printf("dprod_base:");
dump(dest_ref, 5);
printf("dprod:");
dump(dest, 5);
return -1;
}
putchar('.');
}
}
// Test erasure code using gf_vect_dot_prod
// Pick a first test
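// Flow of the check below: encode m - k parity rows from k data sources with
// an RS matrix, erase up to m - k sources at random, invert the surviving
// k x k submatrix, then regenerate each lost source as a dot product of the
// corresponding inverse row against the surviving buffers.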
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
gf_gen_rs_matrix(a, m, k);
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
}
// Random buffers in erasure
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
gf_gen_rs_matrix(a, m, k);
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= k) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (i = 0; i < nerrs; i++)
printf(" %d", src_err_list[i]);
printf("\na:\n");
dump_u8xu8((u8 *) a, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) d, k, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
return -1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);
if (0 != memcmp(dest_ref, dest, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, align);
printf("dprod:");
dump(dest, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
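// With PTR_ALIGN_CHK_B == 32 this yields offsets in [0, 31]; with alignment
// checking disabled (PTR_ALIGN_CHK_B == 0) the mask is 0 and all pointers
// stay aligned.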
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest, 0, TEST_LEN); // zero pad to check write-over
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);
if (memcmp(dest_ref, udest_ptr, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(udest_ptr, 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptr - dest;
if (memcmp(dest, dest_ref, offset)) {
printf("Fail rand ualign pad start\n");
return -1;
}
if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad end\n");
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);
if (memcmp(dest_ref, dest, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
}
printf("done all: Pass\n");
return 0;
}

View File

@ -0,0 +1,525 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_vect_dot_prod
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE 32
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 32
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, rtest, srcs, m, k, nerrs, r, err;
void *buf;
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptr;
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref = buf;
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
temp_buff = buf;
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
memset(dest, 0, TEST_LEN);
memset(temp_buff, 0, TEST_LEN);
memset(dest_ref, 0, TEST_LEN);
memset(g, 0, TEST_SOURCES);
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
} else
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
dump_matrix(buffs, 5, srcs);
printf("dprod_base:");
dump(dest_ref, 5);
printf("dprod:");
dump(dest, 5);
return -1;
}
putchar('.');
}
}
// Test erasure code using gf_vect_dot_prod
// Pick a first test
m = 9;
k = 5;
if (m > MMAX || k > KMAX)
return -1;
gf_gen_rs_matrix(a, m, k);
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
}
// Random buffers in erasure
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
return -1;
}
}
// Do more random tests
for (rtest = 0; rtest < RANDOMS; rtest++) {
while ((m = (rand() % MMAX)) < 2) ;
while ((k = (rand() % KMAX)) >= m || k < 1) ;
if (m > MMAX || k > KMAX)
continue;
gf_gen_rs_matrix(a, m, k);
// Make random data
for (i = 0; i < k; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
// Make parity vects
for (i = k; i < m; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
}
// Random errors
memset(src_in_err, 0, TEST_SOURCES);
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
err = 1 & rand();
src_in_err[i] = err;
if (err)
src_err_list[nerrs++] = i;
}
if (nerrs == 0) { // should have at least one error
while ((err = (rand() % KMAX)) >= k) ;
src_err_list[nerrs++] = err;
src_in_err[err] = 1;
}
// construct b by removing error rows
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
for (j = 0; j < k; j++)
b[k * i + j] = a[k * r + j];
}
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
printf("BAD MATRIX\n");
for (i = 0, r = 0; i < k; i++, r++) {
while (src_in_err[r]) {
r++;
continue;
}
recov[i] = buffs[r];
}
// Recover data
for (i = 0; i < nerrs; i++) {
for (j = 0; j < k; j++)
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
printf(" - erase list = ");
for (i = 0; i < nerrs; i++)
printf(" %d", src_err_list[i]);
printf("\na:\n");
dump_u8xu8((u8 *) a, m, k);
printf("inv b:\n");
dump_u8xu8((u8 *) d, k, k);
printf("orig data:\n");
dump_matrix(buffs, m, 25);
printf("orig :");
dump(buffs[src_err_list[i]], 25);
printf("recov %d:", src_err_list[i]);
dump(temp_buff, 25);
return -1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < TEST_SOURCES; i++)
g[i] = rand();
for (i = 0; i < TEST_SOURCES; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);
if (0 != memcmp(dest_ref, dest, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
dump_matrix(efence_buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, align);
printf("dprod:");
dump(dest, align);
return -1;
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest, 0, TEST_LEN); // zero pad to check write-over
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);
if (memcmp(dest_ref, udest_ptr, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
srcs);
dump_matrix(ubuffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(udest_ptr, 25);
return -1;
}
// Confirm that padding around dests is unchanged
memset(dest_ref, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
offset = udest_ptr - dest;
if (memcmp(dest, dest_ref, offset)) {
printf("Fail rand ualign pad start\n");
return -1;
}
if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad end\n");
return -1;
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
srcs = TEST_SOURCES;
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < srcs; i++)
g[i] = rand();
for (i = 0; i < srcs; i++)
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);
if (memcmp(dest_ref, dest, size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
size);
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref, 25);
printf("dprod:");
dump(dest, 25);
return -1;
}
}
printf("done all: Pass\n");
return 0;
}

View File

@ -0,0 +1,196 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define return rax
%define return.w eax
%define PS 8
%define stack_size 16*3 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
vmovdqa [rsp+16*0],xmm6
vmovdqa [rsp+16*1],xmm7
vmovdqa [rsp+16*2],xmm8
save_reg r12, 3*16 + 0*8
save_reg r15, 3*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp+16*0]
vmovdqa xmm7, [rsp+16*1]
vmovdqa xmm8, [rsp+16*2]
mov r12, [rsp + 3*16 + 0*8]
mov r15, [rsp + 3*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest arg5
%define pos return
%define pos.w return.w
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm8
%define xgft_lo xmm7
%define xgft_hi xmm6
%define x0 xmm0
%define xtmpa xmm1
%define xtmph xmm2
%define xtmpl xmm3
%define xd xmm4
%define xtmpd xmm5
align 16
global gf_vect_mad_avx:function
func(gf_vect_mad_avx)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec_i, 5 ;Multiply by 32
vmovdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
vmovdqu xgft_hi, [vec_i+mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
XLDR xtmpd, [dest+len] ;backup the last 16 bytes in dest
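;; The tail pass at .loop16_overlap below re-processes bytes [len, len+16);
;; xtmpd preserves the original dest bytes so the overlapping region is not
;; accumulated into twice.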
.loop16:
XLDR xd, [dest+pos] ;Get next dest vector
.loop16_overlap:
XLDR x0, [src+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xtmph, xgft_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd, xd, xtmph ;xd += partial
XSTR [dest+pos], xd
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
vmovdqa xd, xtmpd ;Restore xd
jmp .loop16_overlap ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_vect_mad_avx, 02, 01, 0201

View File

@ -0,0 +1,203 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12 ; must be saved and loaded
%define arg5 r15
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define return rax
%define return.w eax
%define PS 8
%define stack_size 16*3 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
vmovdqa [rsp+16*0],xmm6
vmovdqa [rsp+16*1],xmm7
vmovdqa [rsp+16*2],xmm8
save_reg r12, 3*16 + 0*8
save_reg r15, 3*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp+16*0]
vmovdqa xmm7, [rsp+16*1]
vmovdqa xmm8, [rsp+16*2]
mov r12, [rsp + 3*16 + 0*8]
mov r15, [rsp + 3*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define tmp.w r11d
%define tmp.b r11b
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest arg5
%define pos return
%define pos.w return.w
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
%define XSTR vmovdqu
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f ymm8
%define xmask0fx xmm8
%define xgft_lo ymm7
%define xgft_hi ymm6
%define x0 ymm0
%define xtmpa ymm1
%define xtmph ymm2
%define xtmpl ymm3
%define xd ymm4
%define xtmpd ymm5
align 16
global gf_vect_mad_avx2:function
func(gf_vect_mad_avx2)
FUNC_SAVE
sub len, 32
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec_i, 5 ;Multiply by 32
vmovdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
XLDR xtmpd, [dest+len] ;backup the last 32 bytes in dest
.loop32:
XLDR xd, [dest+pos] ;Get next dest vector
.loop32_overlap:
XLDR x0, [src+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpshufb xtmph, xgft_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmpl, xgft_lo, xtmpa ;Lookup mul table of low nibble
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
vpxor xd, xd, xtmph ;xd += partial
XSTR [dest+pos], xd
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
lea tmp, [len + 32]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-32
vmovdqa xd, xtmpd ;Restore xd
jmp .loop32_overlap ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
;;; func core, ver, snum
slversion gf_vect_mad_avx2, 04, 01, 0202

View File

@ -0,0 +1,374 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#include "test.h"
//By default, test sse version
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_4vect_mad_sse
# define REF_FUNCTION gf_4vect_dot_prod_sse
# define VECT 4
#endif
#define str(s) #s
#define xstr(s) str(s)
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN 8*1024
# define TEST_LOOPS 40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
# define TEST_LOOPS 100
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i, j, l;
void *buf;
u8 gf[6][TEST_SOURCES];
u8 *g_tbls;
u8 *dest_ref[VECT];
u8 *dest_ptrs[VECT], *buffs[TEST_SOURCES];
u8 *dest_perf_ptrs[VECT];
struct perf start, stop;
printf("test " xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 16, VECT * TEST_SOURCES * 32)) {
printf("alloc error: Fail");
return -1;
}
g_tbls = buf;
for (i = 0; i < VECT; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ptrs[i] = buf;
memset(dest_ptrs[i], 0, TEST_LEN);
}
for (i = 0; i < VECT; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref[i] = buf;
memset(dest_ref[i], 0, TEST_LEN);
}
for (i = 0; i < VECT; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_perf_ptrs[i] = buf;
memset(dest_perf_ptrs[i], 0, TEST_LEN);
}
// Performance test
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
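// g_tbls holds one bank of tables per output vector: the 32-byte table for
// output i, source j lives at g_tbls[i * 32 * TEST_SOURCES + j * 32].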
for (i = 0; i < VECT; i++)
for (j = 0; j < TEST_SOURCES; j++) {
gf[i][j] = rand();
gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
}
for (i = 0; i < VECT; i++)
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[i * 32 * TEST_SOURCES],
buffs, dest_ref[i]);
for (i = 0; i < VECT; i++)
memset(dest_ptrs[i], 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], *dest_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], dest_ptrs);
#endif
}
for (i = 0; i < VECT; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(dest_ptrs[i], 25);
return -1;
}
}
#if (VECT == 1)
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
#else
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
#endif
for (i = 0; i < VECT; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
dump_matrix(buffs, 5, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(dest_ptrs[i], 25);
return -1;
}
}
#ifdef DO_REF_PERF
#if (VECT == 1)
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
#else
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
#endif
perf_start(&start);
for (l = 0; l < TEST_LOOPS; l++) {
for (j = 0; j < TEST_SOURCES; j++) {
#if (VECT == 1)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
#elif (VECT == 2)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 3)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 4)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 5)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 6)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
#endif
}
#if (VECT == 1)
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
#else
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
#endif
}
perf_stop(&stop);
printf(xstr(REF_FUNCTION) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + VECT) * TEST_LOOPS);
#endif
for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
*dest_perf_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
dest_perf_ptrs);
#endif
}
perf_start(&start);
for (l = 0; l < TEST_LOOPS; l++) {
for (j = 0; j < TEST_SOURCES; j++) {
#if (VECT == 1)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
#elif (VECT == 2)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 3)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 4)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 5)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 6)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
#endif
}
for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
*dest_perf_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
dest_perf_ptrs);
#endif
}
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + VECT) * TEST_LOOPS);
perf_start(&start);
for (l = 0; l < TEST_LOOPS; l++) {
for (j = 0; j < TEST_SOURCES; j++) {
#if (VECT == 1)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
#elif (VECT == 2)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 3)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 4)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 5)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
#elif (VECT == 6)
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
#endif
}
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
*dest_perf_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
dest_perf_ptrs);
#endif
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) "_single_src" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (1 + VECT) * TEST_LOOPS);
perf_start(&start);
for (l = 0; l < TEST_LOOPS; l++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
*dest_perf_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
dest_perf_ptrs);
#endif
}
perf_stop(&stop);
printf(xstr(FUNCTION_UNDER_TEST) "_single_src_simple" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * (1 + VECT) * TEST_LOOPS);
printf("pass perf check\n");
return 0;
}

View File

@ -0,0 +1,197 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg0.w ecx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define arg4 r12
%define arg5 r15
%define tmp r11
%define return rax
%define return.w eax
%define PS 8
%define stack_size 16*3 + 3*8
%define arg(x) [rsp + stack_size + PS + PS*x]
%define func(x) proc_frame x
%macro FUNC_SAVE 0
sub rsp, stack_size
movdqa [rsp+16*0],xmm6
movdqa [rsp+16*1],xmm7
movdqa [rsp+16*2],xmm8
save_reg r12, 3*16 + 0*8
save_reg r15, 3*16 + 1*8
end_prolog
mov arg4, arg(4)
mov arg5, arg(5)
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp+16*0]
movdqa xmm7, [rsp+16*1]
movdqa xmm8, [rsp+16*2]
mov r12, [rsp + 3*16 + 0*8]
mov r15, [rsp + 3*16 + 1*8]
add rsp, stack_size
%endmacro
%elifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg0.w edi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define return rax
%define return.w eax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif
;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest arg5
%define pos return
%define pos.w return.w
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm8
%define xgft_lo xmm7
%define xgft_hi xmm6
%define x0 xmm0
%define xtmpa xmm1
%define xtmph xmm2
%define xtmpl xmm3
%define xd xmm4
%define xtmpd xmm5
align 16
global gf_vect_mad_sse:function
func(gf_vect_mad_sse)
FUNC_SAVE
sub len, 16
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec_i, 5 ;Multiply by 32
movdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqu xgft_hi, [vec_i+mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
XLDR xtmpd, [dest+len] ;backup the last 16 bytes in dest
.loop16:
XLDR xd, [dest+pos] ;Get next dest vector
.loop16_overlap:
XLDR x0, [src+pos] ;Get next source vector
movdqa xtmph, xgft_hi ;Reload const array registers
movdqa xtmpl, xgft_lo
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
pshufb xtmph, x0 ;Lookup mul table of high nibble
pshufb xtmpl, xtmpa ;Lookup mul table of low nibble
pxor xtmph, xtmpl ;GF add high and low partials
pxor xd, xtmph
XSTR [dest+pos], xd ;Store result
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
lea tmp, [len + 16]
cmp pos, tmp
je .return_pass
;; Tail len
mov pos, len ;Overlapped offset length-16
movdqa xd, xtmpd ;Restore xd
jmp .loop16_overlap ;Do one more overlap pass
.return_pass:
mov return, 0
FUNC_RESTORE
ret
.return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_vect_mad_sse, 00, 01, 0200

View File

@ -0,0 +1,508 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset, memcmp
#include "erasure_code.h"
#include "types.h"
#ifndef ALIGN_SIZE
# define ALIGN_SIZE 16
#endif
//By default, test sse version
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_6vect_mad_sse
# define REF_FUNCTION gf_6vect_dot_prod_sse
# define VECT 6
#endif
#ifndef TEST_MIN_SIZE
# define TEST_MIN_SIZE ALIGN_SIZE
#endif
#define str(s) #s
#define xstr(s) str(s)
#define TEST_LEN 8192
#define TEST_SIZE (TEST_LEN/2)
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 20000
#define TEST_TYPE_STR ""
#ifndef TEST_SOURCES
# define TEST_SOURCES 16
#endif
#ifndef RANDOMS
# define RANDOMS 20
#endif
#ifdef EC_ALIGNED_ADDR
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B 0
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
#else
// Define power of 2 range to check ptr, len alignment
# define PTR_ALIGN_CHK_B ALIGN_SIZE
# define LEN_ALIGN_CHK_B ALIGN_SIZE // 0 for aligned only
#endif
#define str(s) #s
#define xstr(s) str(s)
typedef unsigned char u8;
void dump(unsigned char *buf, int len)
{
int i;
for (i = 0; i < len;) {
printf(" %2x", 0xff & buf[i++]);
if (i % 32 == 0)
printf("\n");
}
printf("\n");
}
void dump_matrix(unsigned char **s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", s[i][j]);
}
printf("\n");
}
printf("\n");
}
void dump_u8xu8(unsigned char *s, int k, int m)
{
int i, j;
for (i = 0; i < k; i++) {
for (j = 0; j < m; j++) {
printf(" %2x", 0xff & s[j + (i * m)]);
}
printf("\n");
}
printf("\n");
}
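/*
 * Scalar sketch of the multiply-accumulate ("mad") step the kernels under
 * test perform 16 or 32 bytes at a time. It assumes the 32-byte table layout
 * produced by gf_vect_mul_init(): bytes 0-15 hold C*{0x00..0x0f} (low-nibble
 * products) and bytes 16-31 hold C*{0x00, 0x10, ..., 0xf0} (high-nibble
 * products), so one byte multiply is two lookups XORed together.
 */
static void gf_vect_mad_ref(int len, const unsigned char tbl[32],
			    const unsigned char *src, unsigned char *dest)
{
	int i;
	for (i = 0; i < len; i++)
		dest[i] ^= tbl[src[i] & 0x0f] ^ tbl[16 + (src[i] >> 4)];
}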
int main(int argc, char *argv[])
{
int i, j, rtest, srcs;
void *buf;
u8 gf[6][TEST_SOURCES];
u8 *g_tbls;
u8 *dest_ref[VECT];
u8 *dest_ptrs[VECT], *buffs[TEST_SOURCES];
int vector = VECT;
int align, size;
unsigned char *efence_buffs[TEST_SOURCES];
unsigned int offset;
u8 *ubuffs[TEST_SOURCES];
u8 *udest_ptrs[VECT];
printf("test" xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
// Allocate the arrays
for (i = 0; i < TEST_SOURCES; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
buffs[i] = buf;
}
if (posix_memalign(&buf, 16, 2 * (vector * TEST_SOURCES * 32))) {
printf("alloc error: Fail");
return -1;
}
g_tbls = buf;
for (i = 0; i < vector; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ptrs[i] = buf;
memset(dest_ptrs[i], 0, TEST_LEN);
}
for (i = 0; i < vector; i++) {
if (posix_memalign(&buf, 64, TEST_LEN)) {
printf("alloc error: Fail");
return -1;
}
dest_ref[i] = buf;
memset(dest_ref[i], 0, TEST_LEN);
}
// Test of all zeros
for (i = 0; i < TEST_SOURCES; i++)
memset(buffs[i], 0, TEST_LEN);
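// Intentional fall-through: each case seeds a default coefficient for every
// active output vector from gf[vector - 1] down to gf[0].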
switch (vector) {
case 6:
memset(gf[5], 0xe6, TEST_SOURCES);
case 5:
memset(gf[4], 4, TEST_SOURCES);
case 4:
memset(gf[3], 9, TEST_SOURCES);
case 3:
memset(gf[2], 7, TEST_SOURCES);
case 2:
memset(gf[1], 1, TEST_SOURCES);
case 1:
memset(gf[0], 2, TEST_SOURCES);
break;
default:
return -1;
}
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < vector; i++)
for (j = 0; j < TEST_SOURCES; j++) {
gf[i][j] = rand();
gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
}
for (i = 0; i < vector; i++)
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[i * 32 * TEST_SOURCES],
buffs, dest_ref[i]);
for (i = 0; i < vector; i++)
memset(dest_ptrs[i], 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], *dest_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], dest_ptrs);
#endif
}
for (i = 0; i < vector; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
dump_matrix(buffs, vector, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(dest_ptrs[i], 25);
return -1;
}
}
#if (VECT == 1)
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
#else
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
#endif
for (i = 0; i < vector; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
dump_matrix(buffs, vector, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(dest_ptrs[i], 25);
return -1;
}
}
putchar('.');
// Rand data test
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < vector; i++)
for (j = 0; j < TEST_SOURCES; j++) {
gf[i][j] = rand();
gf_vect_mul_init(gf[i][j],
&g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
}
for (i = 0; i < vector; i++)
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
&g_tbls[i * 32 * TEST_SOURCES], buffs,
dest_ref[i]);
for (i = 0; i < vector; i++)
memset(dest_ptrs[i], 0, TEST_LEN);
for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
*dest_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
dest_ptrs);
#endif
}
for (i = 0; i < vector; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test%d %d\n",
i, rtest);
dump_matrix(buffs, vector, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(dest_ptrs[i], 25);
return -1;
}
}
putchar('.');
}
// Rand data test with varied parameters
for (rtest = 0; rtest < RANDOMS; rtest++) {
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
for (i = 0; i < srcs; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < vector; i++)
for (j = 0; j < srcs; j++) {
gf[i][j] = rand();
gf_vect_mul_init(gf[i][j],
&g_tbls[i * (32 * srcs) + j * 32]);
}
for (i = 0; i < vector; i++)
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[i * 32 * srcs],
buffs, dest_ref[i]);
for (i = 0; i < vector; i++)
memset(dest_ptrs[i], 0, TEST_LEN);
for (i = 0; i < srcs; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
*dest_ptrs);
#else
FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
dest_ptrs);
#endif
}
for (i = 0; i < vector; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test%d srcs=%d\n", i, srcs);
dump_matrix(buffs, vector, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(dest_ptrs[i], 25);
return -1;
}
}
putchar('.');
}
}
// Run tests at end of buffer for Electric Fence
align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < TEST_LEN; j++)
buffs[i][j] = rand();
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
efence_buffs[i] = buffs[i] + TEST_LEN - size;
for (i = 0; i < vector; i++)
for (j = 0; j < TEST_SOURCES; j++) {
gf[i][j] = rand();
gf_vect_mul_init(gf[i][j],
&g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
}
for (i = 0; i < vector; i++)
gf_vect_dot_prod_base(size, TEST_SOURCES,
&g_tbls[i * 32 * TEST_SOURCES], efence_buffs,
dest_ref[i]);
for (i = 0; i < vector; i++)
memset(dest_ptrs[i], 0, size);
for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
*dest_ptrs);
#else
FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
dest_ptrs);
#endif
}
for (i = 0; i < vector; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test%d size=%d\n", i, size);
dump_matrix(buffs, vector, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], TEST_MIN_SIZE + align);
printf("dprod_dut:");
dump(dest_ptrs[i], TEST_MIN_SIZE + align);
return -1;
}
}
putchar('.');
}
// Test rand ptr alignment if available
for (rtest = 0; rtest < RANDOMS; rtest++) {
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
srcs = rand() % TEST_SOURCES;
if (srcs == 0)
continue;
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
// Add random offsets
for (i = 0; i < srcs; i++)
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
for (i = 0; i < vector; i++) {
udest_ptrs[i] = dest_ptrs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
memset(dest_ptrs[i], 0, TEST_LEN); // zero pad to check write-over
}
for (i = 0; i < srcs; i++)
for (j = 0; j < size; j++)
ubuffs[i][j] = rand();
for (i = 0; i < vector; i++)
for (j = 0; j < srcs; j++) {
gf[i][j] = rand();
gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * srcs) + j * 32]);
}
for (i = 0; i < vector; i++)
gf_vect_dot_prod_base(size, srcs, &g_tbls[i * 32 * srcs], ubuffs,
dest_ref[i]);
for (i = 0; i < srcs; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], *udest_ptrs);
#else
FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], udest_ptrs);
#endif
}
for (i = 0; i < vector; i++) {
if (0 != memcmp(dest_ref[i], udest_ptrs[i], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test%d ualign srcs=%d\n", i, srcs);
dump_matrix(buffs, vector, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(udest_ptrs[i], 25);
return -1;
}
}
// Confirm that padding around dests is unchanged
memset(dest_ref[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
for (i = 0; i < vector; i++) {
offset = udest_ptrs[i] - dest_ptrs[i];
if (memcmp(dest_ptrs[i], dest_ref[0], offset)) {
printf("Fail rand ualign pad1 start\n");
return -1;
}
if (memcmp
(dest_ptrs[i] + offset + size, dest_ref[0],
PTR_ALIGN_CHK_B - offset)) {
printf("Fail rand ualign pad1 end\n");
return -1;
}
}
putchar('.');
}
// Test all size alignment
align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
for (i = 0; i < TEST_SOURCES; i++)
for (j = 0; j < size; j++)
buffs[i][j] = rand();
for (i = 0; i < vector; i++) {
for (j = 0; j < TEST_SOURCES; j++) {
gf[i][j] = rand();
gf_vect_mul_init(gf[i][j],
&g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
}
memset(dest_ptrs[i], 0, TEST_LEN); // zero pad to check write-over
}
for (i = 0; i < vector; i++)
gf_vect_dot_prod_base(size, TEST_SOURCES,
&g_tbls[i * 32 * TEST_SOURCES], buffs,
dest_ref[i]);
for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
*dest_ptrs);
#else
FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
dest_ptrs);
#endif
}
for (i = 0; i < vector; i++) {
if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
" test%d ualign len=%d\n", i, size);
dump_matrix(buffs, vector, TEST_SOURCES);
printf("dprod_base:");
dump(dest_ref[i], 25);
printf("dprod_dut:");
dump(dest_ptrs[i], 25);
return -1;
}
}
putchar('.');
}
printf("Pass\n");
return 0;
}

View File

@ -0,0 +1,164 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_mul_avx(len, mul_array, src, dest)
;;;
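;;;
;;; mul_array is expected to hold the 32-byte table built by gf_vect_mul_init(c, ...):
;;; bytes 0-15 are the products c*{0x00..0x0f} (low nibble) and bytes 16-31 are the
;;; products c*{0x00,0x10,...,0xf0} (high nibble). Each source byte b is then
;;; multiplied as tbl_lo[b & 0x0f] ^ tbl_hi[b >> 4], since GF(2^8) multiplication
;;; distributes over the XOR of the two nibbles.
;;;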
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define return rax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%elifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define return rax
%define stack_size 5*16 + 8 ; must be an odd multiple of 8
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm13, 2*16
save_xmm128 xmm14, 3*16
save_xmm128 xmm15, 4*16
end_prolog
%endmacro
%macro FUNC_RESTORE 0
vmovdqa xmm6, [rsp + 0*16]
vmovdqa xmm7, [rsp + 1*16]
vmovdqa xmm13, [rsp + 2*16]
vmovdqa xmm14, [rsp + 3*16]
vmovdqa xmm15, [rsp + 4*16]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define mul_array arg1
%define src arg2
%define dest arg3
%define pos return
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR vmovdqa
%define XSTR vmovdqa
%else
%define XLDR vmovntdqa
%define XSTR vmovntdq
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft_lo xmm14
%define xgft_hi xmm13
%define x0 xmm0
%define xtmp1a xmm1
%define xtmp1b xmm2
%define xtmp1c xmm3
%define x1 xmm4
%define xtmp2a xmm5
%define xtmp2b xmm6
%define xtmp2c xmm7
align 16
global gf_vect_mul_avx:function
func(gf_vect_mul_avx)
FUNC_SAVE
mov pos, 0
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
vmovdqu xgft_lo, [mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
vmovdqu xgft_hi, [mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
loop32:
XLDR x0, [src+pos] ;Get next source vector
XLDR x1, [src+pos+16] ;Get next source vector + 16B ahead
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
vpand xtmp1a, x0, xmask0f ;Mask low src nibble in bits 4-0
vpand xtmp2a, x1, xmask0f
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpsraw x1, x1, 4
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
vpand x1, x1, xmask0f
vpshufb xtmp1b, xgft_hi, x0 ;Lookup mul table of high nibble
vpshufb xtmp1c, xgft_lo, xtmp1a ;Lookup mul table of low nibble
vpshufb xtmp2b, xgft_hi, x1 ;Lookup mul table of high nibble
vpshufb xtmp2c, xgft_lo, xtmp2a ;Lookup mul table of low nibble
vpxor xtmp1b, xtmp1b, xtmp1c ;GF add high and low partials
vpxor xtmp2b, xtmp2b, xtmp2c
XSTR [dest+pos-32], xtmp1b ;Store result
XSTR [dest+pos-16], xtmp2b ;Store +16B result
jl loop32
return_pass:
FUNC_RESTORE
sub pos, len
ret
return_fail:
FUNC_RESTORE
mov return, 1
ret
endproc_frame
section .data
align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_vect_mul_avx, 01, 03, 0036

View File

@ -0,0 +1,99 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset
#include "erasure_code.h"
#include "test.h"
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN 8*1024
# define TEST_LOOPS 4000000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN GT_L3_CACHE / 2
# define TEST_LOOPS 1000
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
#define TEST_MEM (2 * TEST_LEN)
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i;
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
struct perf start, stop;
printf("gf_vect_mul_avx_perf:\n");
gf_vect_mul_init(a, gf_const_tbl);
// Allocate large mem region
buff1 = (u8 *) malloc(TEST_LEN);
buff2 = (u8 *) malloc(TEST_LEN);
if (NULL == buff1 || NULL == buff2) {
printf("Failed to allocate %dB\n", TEST_LEN);
return 1;
}
memset(buff1, 0, TEST_LEN);
memset(buff2, 0, TEST_LEN);
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
printf("Start timed tests\n");
fflush(0);
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
gf_vect_mul_init(a, gf_const_tbl);
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
}
perf_stop(&stop);
printf("gf_vect_mul_avx" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * i);
return 0;
}

View File

@ -0,0 +1,143 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset
#include "erasure_code.h"
#define TEST_SIZE 8192
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 100000
#define TEST_TYPE_STR ""
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i;
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
int align, size;
unsigned char *efence_buff1;
unsigned char *efence_buff2;
unsigned char *efence_buff3;
printf("gf_vect_mul_avx:\n");
gf_vect_mul_init(a, gf_const_tbl);
buff1 = (u8 *) malloc(TEST_SIZE);
buff2 = (u8 *) malloc(TEST_SIZE);
buff3 = (u8 *) malloc(TEST_SIZE);
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
printf("buffer alloc error\n");
return -1;
}
// Fill with rand data
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
gf_vect_mul_avx(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
gf_mul(2, buff1[i]));
return 1;
}
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
// Check reference function
for (i = 0; i < TEST_SIZE; i++)
if (buff2[i] != buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return 1;
}
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
// Check each possible constant
printf("Random tests ");
for (a = 0; a != 255; a++) {
gf_vect_mul_init(a, gf_const_tbl);
gf_vect_mul_avx(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return 1;
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
align = 32;
a = 2;
gf_vect_mul_init(a, gf_const_tbl);
for (size = 0; size < TEST_SIZE; size += align) {
// Line up TEST_SIZE from end
efence_buff1 = buff1 + size;
efence_buff2 = buff2 + size;
efence_buff3 = buff3 + size;
gf_vect_mul_avx(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
for (i = 0; i < TEST_SIZE - size; i++)
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
i, efence_buff1[i], efence_buff2[i],
gf_mul(2, efence_buff1[i]));
return 1;
}
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
// Check reference function
for (i = 0; i < TEST_SIZE - size; i++)
if (efence_buff2[i] != efence_buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, efence_buff2[i], efence_buff3[i], gf_mul(2,
efence_buff1
[i]));
return 1;
}
putchar('.');
}
printf(" done: Pass\n");
return 0;
}

View File

@ -0,0 +1,129 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset
#include "erasure_code.h"
#define TEST_SIZE 8192
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 100000
#define TEST_TYPE_STR ""
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i;
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
int align, size;
unsigned char *efence_buff1;
unsigned char *efence_buff2;
printf("gf_vect_mul_base_test:\n");
gf_vect_mul_init(a, gf_const_tbl);
buff1 = (u8 *) malloc(TEST_SIZE);
buff2 = (u8 *) malloc(TEST_SIZE);
buff3 = (u8 *) malloc(TEST_SIZE);
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
printf("buffer alloc error\n");
return -1;
}
// Fill with rand data
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
gf_mul(2, buff1[i]));
return 1;
}
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
// Check reference function
for (i = 0; i < TEST_SIZE; i++)
if (buff2[i] != buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return 1;
}
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
// Check each possible constant
printf("Random tests ");
for (a = 0; a != 255; a++) {
gf_vect_mul_init(a, gf_const_tbl);
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return 1;
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
align = 32;
a = 2;
gf_vect_mul_init(a, gf_const_tbl);
for (size = 0; size < TEST_SIZE; size += align) {
// Line up TEST_SIZE from end
efence_buff1 = buff1 + size;
efence_buff2 = buff2 + size;
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
for (i = 0; i < TEST_SIZE - size; i++)
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
i, efence_buff1[i], efence_buff2[i],
gf_mul(2, efence_buff1[i]));
return 1;
}
putchar('.');
}
printf(" done: Pass\n");
return 0;
}

View File

@ -0,0 +1,99 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset
#include "erasure_code.h"
#include "test.h"
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN 8*1024
# define TEST_LOOPS 4000000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN GT_L3_CACHE / 2
# define TEST_LOOPS 1000
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
#define TEST_MEM (2 * TEST_LEN)
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i;
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
struct perf start, stop;
printf("gf_vect_mul_perf:\n");
gf_vect_mul_init(a, gf_const_tbl);
// Allocate large mem region
buff1 = (u8 *) malloc(TEST_LEN);
buff2 = (u8 *) malloc(TEST_LEN);
if (NULL == buff1 || NULL == buff2) {
printf("Failed to allocate %dB\n", TEST_LEN);
return 1;
}
memset(buff1, 0, TEST_LEN);
memset(buff2, 0, TEST_LEN);
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
printf("Start timed tests\n");
fflush(0);
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
gf_vect_mul_init(a, gf_const_tbl);
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
}
perf_stop(&stop);
printf("gf_vect_mul" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * i);
return 0;
}

View File

@ -0,0 +1,170 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_mul_sse(len, mul_array, src, dest)
;;;
%include "reg_sizes.asm"
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
%define arg1 rsi
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
%define arg5 r9
%define tmp r11
%define return rax
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%elifidn __OUTPUT_FORMAT__, win64
%define arg0 rcx
%define arg1 rdx
%define arg2 r8
%define arg3 r9
%define return rax
%define stack_size 5*16 + 8 ; must be an odd multiple of 8
%define func(x) proc_frame x
%macro FUNC_SAVE 0
alloc_stack stack_size
save_xmm128 xmm6, 0*16
save_xmm128 xmm7, 1*16
save_xmm128 xmm13, 2*16
save_xmm128 xmm14, 3*16
save_xmm128 xmm15, 4*16
end_prolog
%endmacro
%macro FUNC_RESTORE 0
movdqa xmm6, [rsp + 0*16]
movdqa xmm7, [rsp + 1*16]
movdqa xmm13, [rsp + 2*16]
movdqa xmm14, [rsp + 3*16]
movdqa xmm15, [rsp + 4*16]
add rsp, stack_size
%endmacro
%endif
%define len arg0
%define mul_array arg1
%define src arg2
%define dest arg3
%define pos return
;;; Use non-temporal load/store
%ifdef NO_NT_LDST
%define XLDR movdqa
%define XSTR movdqa
%else
%define XLDR movntdqa
%define XSTR movntdq
%endif
default rel
[bits 64]
section .text
%define xmask0f xmm15
%define xgft_lo xmm14
%define xgft_hi xmm13
%define x0 xmm0
%define xtmp1a xmm1
%define xtmp1b xmm2
%define xtmp1c xmm3
%define x1 xmm4
%define xtmp2a xmm5
%define xtmp2b xmm6
%define xtmp2c xmm7
align 16
global gf_vect_mul_sse:function
func(gf_vect_mul_sse)
FUNC_SAVE
mov pos, 0
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
movdqu xgft_lo, [mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
movdqu xgft_hi, [mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
loop32:
XLDR x0, [src+pos] ;Get next source vector
XLDR x1, [src+pos+16] ;Get next source vector + 16B ahead
movdqa xtmp1b, xgft_hi ;Reload const array registers
movdqa xtmp1c, xgft_lo
movdqa xtmp2b, xgft_hi
movdqa xtmp2c, xgft_lo
movdqa xtmp1a, x0 ;Keep unshifted copy of src
movdqa xtmp2a, x1
psraw x0, 4 ;Shift to put high nibble into bits 4-0
psraw x1, 4
pand xtmp1a, xmask0f ;Mask low src nibble in bits 4-0
pand xtmp2a, xmask0f
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand x1, xmask0f
pshufb xtmp1b, x0 ;Lookup mul table of high nibble
pshufb xtmp1c, xtmp1a ;Lookup mul table of low nibble
pshufb xtmp2b, x1
pshufb xtmp2c, xtmp2a
pxor xtmp1b, xtmp1c ;GF add high and low partials
pxor xtmp2b, xtmp2c
XSTR [dest+pos], xtmp1b ;Store result
XSTR [dest+pos+16], xtmp2b ;Store +16B result
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jl loop32
return_pass:
sub pos, len
FUNC_RESTORE
ret
return_fail:
mov return, 1
FUNC_RESTORE
ret
endproc_frame
section .data
align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;;; func core, ver, snum
slversion gf_vect_mul_sse, 00, 03, 0034

View File

@ -0,0 +1,97 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset
#include "erasure_code.h"
#include "test.h"
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN 8*1024
# define TEST_LOOPS 4000000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
# define TEST_SOURCES 10
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
# define TEST_LEN GT_L3_CACHE / 2
# define TEST_LOOPS 1000
# define TEST_TYPE_STR "_cold"
# else
# define TEST_TYPE_STR "_cus"
# ifndef TEST_LOOPS
# define TEST_LOOPS 1000
# endif
# endif
#endif
#define TEST_MEM (2 * TEST_LEN)
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i;
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
struct perf start, stop;
printf("gf_vect_mul_sse_perf:\n");
gf_vect_mul_init(a, gf_const_tbl);
// Allocate large mem region
buff1 = (u8 *) malloc(TEST_LEN);
buff2 = (u8 *) malloc(TEST_LEN);
if (NULL == buff1 || NULL == buff2) {
printf("Failed to allocate %dB\n", TEST_LEN);
return 1;
}
memset(buff1, 0, TEST_LEN);
memset(buff2, 0, TEST_LEN);
printf("Start timed tests\n");
fflush(0);
gf_vect_mul_sse(TEST_LEN, gf_const_tbl, buff1, buff2);
perf_start(&start);
for (i = 0; i < TEST_LOOPS; i++) {
gf_vect_mul_init(a, gf_const_tbl); // in a re-build would only calc once
gf_vect_mul_sse(TEST_LEN, gf_const_tbl, buff1, buff2);
}
perf_stop(&stop);
printf("gf_vect_mul_sse" TEST_TYPE_STR ": ");
perf_print(stop, start, (long long)TEST_LEN * i);
return 0;
}

View File

@ -0,0 +1,160 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "erasure_code.h"
#define TEST_SIZE (128*1024)
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i;
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
int tsize;
int align, size;
unsigned char *efence_buff1;
unsigned char *efence_buff2;
unsigned char *efence_buff3;
printf("gf_vect_mul_sse_test: ");
gf_vect_mul_init(a, gf_const_tbl);
buff1 = (u8 *) malloc(TEST_SIZE);
buff2 = (u8 *) malloc(TEST_SIZE);
buff3 = (u8 *) malloc(TEST_SIZE);
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
printf("buffer alloc error\n");
return -1;
}
// Fill with rand data
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
gf_vect_mul_sse(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++) {
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i,
buff1[i], buff2[i], gf_mul(2, buff1[i]));
return -1;
}
}
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
// Check reference function
for (i = 0; i < TEST_SIZE; i++) {
if (buff2[i] != buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return -1;
}
}
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
// Check each possible constant
for (a = 0; a != 255; a++) {
gf_vect_mul_init(a, gf_const_tbl);
gf_vect_mul_sse(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return -1;
}
putchar('.');
}
// Check buffer len
for (tsize = TEST_SIZE; tsize > 0; tsize -= 32) {
a = rand();
gf_vect_mul_init(a, gf_const_tbl);
gf_vect_mul_sse(tsize, gf_const_tbl, buff1, buff2);
for (i = 0; i < tsize; i++)
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return -1;
}
if (0 == tsize % (32 * 8)) {
putchar('.');
fflush(0);
}
}
// Run tests at end of buffer for Electric Fence
align = 32;
a = 2;
gf_vect_mul_init(a, gf_const_tbl);
for (size = 0; size < TEST_SIZE; size += align) {
// Line up TEST_SIZE from end
efence_buff1 = buff1 + size;
efence_buff2 = buff2 + size;
efence_buff3 = buff3 + size;
gf_vect_mul_sse(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
for (i = 0; i < TEST_SIZE - size; i++)
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
i, efence_buff1[i], efence_buff2[i], gf_mul(2,
efence_buff1
[i]));
return 1;
}
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
// Check reference function
for (i = 0; i < TEST_SIZE - size; i++)
if (efence_buff2[i] != efence_buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, efence_buff2[i], efence_buff3[i], gf_mul(2,
efence_buff1
[i]));
return 1;
}
putchar('.');
}
printf(" done: Pass\n");
fflush(0);
return 0;
}

View File

@ -0,0 +1,142 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset
#include "erasure_code.h"
#define TEST_SIZE 8192
#define TEST_MEM TEST_SIZE
#define TEST_LOOPS 100000
#define TEST_TYPE_STR ""
typedef unsigned char u8;
int main(int argc, char *argv[])
{
int i;
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
int align, size;
unsigned char *efence_buff1;
unsigned char *efence_buff2;
unsigned char *efence_buff3;
printf("gf_vect_mul_test:\n");
gf_vect_mul_init(a, gf_const_tbl);
buff1 = (u8 *) malloc(TEST_SIZE);
buff2 = (u8 *) malloc(TEST_SIZE);
buff3 = (u8 *) malloc(TEST_SIZE);
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
printf("buffer alloc error\n");
return -1;
}
// Fill with rand data
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++)
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
gf_mul(2, buff1[i]));
return 1;
}
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
// Check reference function
for (i = 0; i < TEST_SIZE; i++)
if (buff2[i] != buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return 1;
}
for (i = 0; i < TEST_SIZE; i++)
buff1[i] = rand();
// Check each possible constant
printf("Random tests ");
for (a = 0; a != 255; a++) {
gf_vect_mul_init(a, gf_const_tbl);
gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
for (i = 0; i < TEST_SIZE; i++) {
if (gf_mul(a, buff1[i]) != buff2[i]) {
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
return 1;
}
}
putchar('.');
}
// Run tests at end of buffer for Electric Fence
align = 32;
a = 2;
gf_vect_mul_init(a, gf_const_tbl);
for (size = 0; size < TEST_SIZE; size += align) {
// Line up TEST_SIZE from end
efence_buff1 = buff1 + size;
efence_buff2 = buff2 + size;
efence_buff3 = buff3 + size;
gf_vect_mul(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
for (i = 0; i < TEST_SIZE - size; i++)
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
i, efence_buff1[i], efence_buff2[i],
gf_mul(2, efence_buff1[i]));
return 1;
}
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
// Check reference function
for (i = 0; i < TEST_SIZE - size; i++)
if (efence_buff2[i] != efence_buff3[i]) {
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
i, a, efence_buff2[i], efence_buff3[i],
gf_mul(2, efence_buff1[i]));
return 1;
}
putchar('.');
}
printf(" done: Pass\n");
return 0;
}

933
include/erasure_code.h Normal file
View File

@ -0,0 +1,933 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#ifndef _ERASURE_CODE_H_
#define _ERASURE_CODE_H_
/**
* @file erasure_code.h
* @brief Interface to functions supporting erasure code encode and decode.
*
* This file defines the interface to optimized functions used in erasure
* codes. Encode and decode of erasures in GF(2^8) are made by calculating the
* dot product of the symbols (bytes in GF(2^8)) across a set of buffers and a
* set of coefficients. Values for the coefficients are determined by the type
* of erasure code. Using a general dot product means that any sequence of
* coefficients may be used including erasure codes based on random
* coefficients.
* Multiple versions of dot product are supplied to calculate 1-6 output
* vectors in one pass.
* Base GF multiply and divide functions can be sped up by defining
* GF_LARGE_TABLES at the expense of memory size.
*
*/
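/*
* Conceptually, with a generator matrix a of 'rows' rows by k columns, every
* output byte is the GF(2^8) dot product
*
*     coding[r][i] = a[r*k + 0] (*) data[0][i] ^ ... ^ a[r*k + k-1] (*) data[k-1][i]
*
* where (*) is GF(2^8) multiplication (gf_mul) and ^, the byte XOR, is GF(2^8)
* addition. The routines declared below compute this sum with table lookups.
*/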
#include "gf_vect_mul.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Initialize tables for fast Erasure Code encode and decode.
*
* Generates the expanded tables needed for fast encode or decode for erasure
* codes on blocks of data. 32 bytes are generated for each input coefficient.
*
* @param k The number of vector sources or rows in the generator matrix
* for coding.
* @param rows The number of output vectors to concurrently encode/decode.
* @param a Pointer to sets of arrays of input coefficients used to encode
* or decode data.
* @param gftbls Pointer to start of space for concatenated output tables
* generated from input coefficients. Must be of size 32*k*rows.
* @returns none
*/
void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
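/*
* For reference: each input coefficient expands to one 32-byte lookup block,
* so the table for output row r and source j starts at gftbls + (r*k + j)*32,
* matching the &g_tbls[vec * 32 * k + j * 32] indexing used by the dot product
* unit tests in this commit.
*/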
/**
* @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
*
* Given a list of source data blocks, generate one or multiple blocks of
* encoded data as specified by a matrix of GF(2^8) coefficients. When given a
* suitable set of coefficients, this function will perform the fast generation
* or decoding of Reed-Solomon type erasure codes.
*
* This function determines what instruction sets are enabled and
* selects the appropriate version at runtime.
*
* @param len Length of each block of data (vector) of source or dest data.
* @param k The number of vector sources or rows in the generator matrix
* for coding.
* @param rows The number of output vectors to concurrently encode/decode.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*k*rows
* @param data Array of pointers to source input buffers.
* @param coding Array of pointers to coded output buffers.
* @returns none
*/
void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
unsigned char **coding);
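/*
* Minimal usage sketch (K and ROWS are illustrative sizes; the data and coding
* buffers of 'len' bytes each are assumed to be allocated by the caller).
* gf_gen_rs_matrix(), declared further down in this header, builds a systematic
* (K+ROWS) x K matrix whose first K rows are the identity, so only the parity
* rows starting at a[K*K] are expanded into tables:
*
*     unsigned char a[(K + ROWS) * K];
*     unsigned char g_tbls[K * ROWS * 32];
*     unsigned char *data[K], *coding[ROWS];
*
*     gf_gen_rs_matrix(a, K + ROWS, K);
*     ec_init_tables(K, ROWS, &a[K * K], g_tbls);
*     ec_encode_data(len, K, ROWS, g_tbls, data, coding);
*/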
/**
* @brief Generate or decode erasure codes on blocks of data.
*
* Arch specific version of ec_encode_data() with same parameters.
* @requires SSE4.1
*/
void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
unsigned char **coding);
/**
* @brief Generate or decode erasure codes on blocks of data.
*
* Arch specific version of ec_encode_data() with same parameters.
* @requires AVX
*/
void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
unsigned char **coding);
/**
* @brief Generate or decode erasure codes on blocks of data.
*
* Arch specific version of ec_encode_data() with same parameters.
* @requires AVX2
*/
void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
unsigned char **coding);
/**
* @brief Generate or decode erasure codes on blocks of data, runs baseline version.
*
* Baseline version of ec_encode_data() with same parameters.
*/
void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
unsigned char **dest);
/**
* @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
*
* Given one source data block, update one or multiple blocks of encoded data as
* specified by a matrix of GF(2^8) coefficients. When given a suitable set of
* coefficients, this function will perform the fast generation or decoding of
* Reed-Solomon type erasure codes from one input source at a time.
*
* This function determines what instruction sets are enabled and selects the
* appropriate version at runtime.
*
* @param len Length of each block of data (vector) of source or dest data.
* @param k The number of vector sources or rows in the generator matrix
* for coding.
* @param rows The number of output vectors to concurrently encode/decode.
* @param vec_i The vector index corresponding to the single input source.
* @param g_tbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*k*rows
* @param data Pointer to single input source used to update output parity.
* @param coding Array of pointers to coded output buffers.
* @returns none
*/
void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
unsigned char *data, unsigned char **coding);
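/*
* Illustrative use (K, ROWS and buffers as in the sketch above): with the
* coding buffers zeroed first, one update per source index accumulates the
* same parity as a single ec_encode_data() call over all K sources:
*
*     for (j = 0; j < ROWS; j++)
*             memset(coding[j], 0, len);
*     for (i = 0; i < K; i++)
*             ec_encode_data_update(len, K, ROWS, i, g_tbls, data[i], coding);
*/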
/**
* @brief Generate update for encode or decode of erasure codes from single source.
*
* Arch specific version of ec_encode_data_update() with same parameters.
* @requires SSE4.1
*/
void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
unsigned char *data, unsigned char **coding);
/**
* @brief Generate update for encode or decode of erasure codes from single source.
*
* Arch specific version of ec_encode_data_update() with same parameters.
* @requires AVX
*/
void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
unsigned char *data, unsigned char **coding);
/**
* @brief Generate update for encode or decode of erasure codes from single source.
*
* Arch specific version of ec_encode_data_update() with same parameters.
* @requires AVX2
*/
void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
unsigned char *data, unsigned char **coding);
/**
* @brief Generate update for encode or decode of erasure codes from single source.
*
* Baseline version of ec_encode_data_update().
*/
void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
unsigned char *data, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product.
*
* Does a GF(2^8) dot product across each byte of the input array and a constant
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
* on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
/**
* @brief GF(2^8) vector dot product.
*
* Does a GF(2^8) dot product across each byte of the input array and a constant
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
* @requires AVX
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
* on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
/**
* @brief GF(2^8) vector dot product.
*
* Does a GF(2^8) dot product across each byte of the input array and a constant
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
* @requires AVX2
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
* on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
/**
* @brief GF(2^8) vector dot product with two outputs.
*
* Vector dot product optimized to calculate two outputs at a time. Does two
* GF(2^8) dot products across each byte of the input array and two constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with two outputs.
*
* Vector dot product optimized to calculate two outputs at a time. Does two
* GF(2^8) dot products across each byte of the input array and two constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
* @requires AVX
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with two outputs.
*
* Vector dot product optimized to calculate two outputs at a time. Does two
* GF(2^8) dot products across each byte of the input array and two constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
* @requires AVX2
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with three outputs.
*
* Vector dot product optimized to calculate three outputs at a time. Does three
* GF(2^8) dot products across each byte of the input array and three constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with three outputs.
*
* Vector dot product optimized to calculate three outputs at a time. Does three
* GF(2^8) dot products across each byte of the input array and three constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
* @requires AVX
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with three outputs.
*
* Vector dot product optimized to calculate three outputs at a time. Does three
* GF(2^8) dot products across each byte of the input array and three constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
* @requires AVX2
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with four outputs.
*
* Vector dot product optimized to calculate four outputs at a time. Does four
* GF(2^8) dot products across each byte of the input array and four constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with four outputs.
*
* Vector dot product optimized to calculate four outputs at a time. Does four
* GF(2^8) dot products across each byte of the input array and four constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
* @requires AVX
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with four outputs.
*
* Vector dot product optimized to calculate four outputs at a time. Does four
* GF(2^8) dot products across each byte of the input array and four constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
* @requires AVX2
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with five outputs.
*
* Vector dot product optimized to calculate five outputs at a time. Does five
* GF(2^8) dot products across each byte of the input array and five constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
* @requires SSE4.1
*
* @param len    Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with five outputs.
*
* Vector dot product optimized to calculate five outputs at a time. Does five
* GF(2^8) dot products across each byte of the input array and five constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
* @requires AVX
*
* @param len    Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with five outputs.
*
* Vector dot product optimized to calculate five outputs at a time. Does five
* GF(2^8) dot products across each byte of the input array and five constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
* @requires AVX2
*
* @param len    Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with six outputs.
*
* Vector dot product optimized to calculate six outputs at a time. Does six
* GF(2^8) dot products across each byte of the input array and six constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with six outputs.
*
* Vector dot product optimized to calculate six outputs at a time. Does six
* GF(2^8) dot products across each byte of the input array and six constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
* @requires AVX
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with six outputs.
*
* Vector dot product optimized to calculate six outputs at a time. Does six
* GF(2^8) dot products across each byte of the input array and six constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
* @requires AVX2
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_6vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product, runs baseline version.
*
* Does a GF(2^8) dot product across each byte of the input array and a constant
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
* on the array of input coefficients. Only elements 32*CONST*j + 1
* of this array are used, where j = (0, 1, 2...) and CONST is the
* number of elements in the array of input coefficients. The
* elements used correspond to the original input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
/**
* @brief GF(2^8) vector dot product, runs appropriate version.
*
* Does a GF(2^8) dot product across each byte of the input array and a constant
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
*
* This function determines what instruction sets are enabled and
* selects the appropriate version at runtime.
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
* on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
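/*
 * Example: compute a single output fragment with the runtime-dispatched
 * version (illustrative sketch only, not part of the original header; k, len,
 * coeffs, srcs and parity are hypothetical names).
 *
 *     unsigned char gftbls[32 * k];
 *     ec_init_tables(k, 1, coeffs, gftbls);     // one set of k coefficients
 *     gf_vect_dot_prod(len, k, gftbls, srcs, parity);
 */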
/**
* @brief GF(2^8) vector multiply accumulate, runs appropriate version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constant and add to destination array. Can be used for erasure coding encode
* and decode update when only one source is available at a time. Function
* requires pre-calculation of a 32*vec byte constant array based on the input
* coefficients.
*
* This function determines what instruction sets are enabled and selects the
* appropriate version at runtime.
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char *dest);
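/*
 * Example: incremental encode where source fragments become available one at
 * a time (illustrative sketch only, not part of the original header; names
 * are hypothetical). The destination starts zeroed and accumulates each
 * source's contribution.
 *
 *     memset(parity, 0, len);
 *     for (i = 0; i < vec; i++)          // as source fragment i arrives
 *             gf_vect_mad(len, vec, i, gftbls, srcs[i], parity);
 */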
/**
* @brief GF(2^8) vector multiply accumulate, arch specific version.
*
* Arch specific version of gf_vect_mad() with same parameters.
* @requires SSE4.1
*/
void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char *dest);
/**
* @brief GF(2^8) vector multiply accumulate, arch specific version.
*
* Arch specific version of gf_vect_mad() with same parameters.
* @requires AVX
*/
void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char *dest);
/**
* @brief GF(2^8) vector multiply accumulate, arch specific version.
*
* Arch specific version of gf_vect_mad() with same parameters.
* @requires AVX2
*/
void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char *dest);
/**
* @brief GF(2^8) vector multiply accumulate, baseline version.
*
* Baseline version of gf_vect_mad() with same parameters.
*/
void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
unsigned char *dest);
/**
* @brief GF(2^8) vector multiply with 2 accumulate. SSE version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
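/*
 * Example: fold one newly available source fragment into two partially
 * encoded parity fragments at once (illustrative sketch only; vec, vec_i and
 * the buffer names are hypothetical, and gftbls is assumed to come from
 * ec_init_tables() with two parity rows).
 *
 *     unsigned char *parity[2];          // both updated in place
 *     gf_2vect_mad_sse(len, vec, vec_i, gftbls, src, parity);
 */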
/**
* @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
* @requires AVX
*/
void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
* @requires AVX2
*/
void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
* @requires AVX
*/
void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
* @requires AVX2
*/
void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires SSE4.1
*
* @param len Length of each vector in bytes. Must be >= 32.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
* @requires AVX
*/
void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
* @requires AVX2
*/
void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
* @requires SSE4.1
*/
void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
* @requires AVX
*/
void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
* @requires AVX2
*/
void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
* @requires SSE4.1
*/
void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
* @requires AVX
*/
void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
* @requires AVX2
*/
void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**********************************************************************
* The remaining are lib support functions used in GF(2^8) operations.
*/
/**
* @brief Single element GF(2^8) multiply.
*
* @param a Multiplicand a
* @param b Multiplicand b
* @returns Product of a and b in GF(2^8)
*/
unsigned char gf_mul(unsigned char a, unsigned char b);
/**
* @brief Single element GF(2^8) inverse.
*
* @param a Input element
* @returns Field element b such that a x b = {1}
*/
unsigned char gf_inv(unsigned char a);
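/*
 * Example: a non-zero element multiplied by its inverse gives the field
 * identity (illustrative sketch only, not part of the original header).
 *
 *     unsigned char a = 0x1d;
 *     unsigned char b = gf_inv(a);       // gf_mul(a, b) == 1
 */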
/**
* @brief Generate a matrix of coefficients to be used for encoding.
*
* Vandermonde matrix example of encoding coefficients where high portion of
* matrix is identity matrix I and lower portion is constructed as 2^{i*(j-k+1)}
* i:{0,k-1} j:{k,m-1}. Commonly used method for choosing coefficients in
* erasure encoding but does not guarantee every sub-matrix is invertible. For
* large k it is possible to find cases where the decode matrix chosen from
* sources and parity not in erasure is not invertible. Users may want to
* adjust for k > 5.
*
* @param a [mxk] array to hold coefficients
* @param m number of rows in matrix corresponding to srcs + parity.
* @param k number of columns in matrix corresponding to srcs.
* @returns none
*/
void gf_gen_rs_matrix(unsigned char *a, int m, int k);
/**
* @brief Generate a Cauchy matrix of coefficients to be used for encoding.
*
* Cauchy matrix example of encoding coefficients where high portion of matrix
* is identity matrix I and lower portion is constructed as 1/(i + j) | i != j,
* i:{0,k-1} j:{k,m-1}. Any sub-matrix of a Cauchy matrix should be invertible.
*
* @param a [mxk] array to hold coefficients
* @param m number of rows in matrix corresponding to srcs + parity.
* @param k number of columns in matrix corresponding to srcs.
* @returns none
*/
void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
/**
* @brief Invert a matrix in GF(2^8)
*
* @param in input matrix
* @param out output matrix such that [in] x [out] = [I] - identity matrix
* @param n size of matrix [nxn]
* @returns 0 successful, other fail on singular input matrix
*/
int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
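/*
 * Example: build and invert a decode matrix from the encode-matrix rows that
 * correspond to surviving fragments (illustrative sketch only, not part of
 * the original header; m, k, survivors[] and the buffer names are hypothetical).
 *
 *     unsigned char encode[m * k], sub[k * k], decode[k * k];
 *     gf_gen_cauchy1_matrix(encode, m, k);
 *     for (i = 0; i < k; i++)            // survivors[i]: row of i-th intact fragment
 *             memcpy(&sub[k * i], &encode[k * survivors[i]], k);
 *     if (gf_invert_matrix(sub, decode, k) != 0)
 *             return -1;                 // singular sub-matrix
 */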
/*************************************************************/
#ifdef __cplusplus
}
#endif
#endif //_ERASURE_CODE_H_

148
include/gf_vect_mul.h Normal file
View File

@ -0,0 +1,148 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#ifndef _GF_VECT_MUL_H
#define _GF_VECT_MUL_H
/**
* @file gf_vect_mul.h
* @brief Interface to functions for vector (block) multiplication in GF(2^8).
*
* This file defines the interface to routines used in fast RAID rebuild and
* erasure codes.
*/
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief GF(2^8) vector multiply by constant.
*
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
* and partial write functions. Function requires pre-calculation of a
* 32-element constant array based on constant C. gftbl(C) = {C{00},
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
* and src must be aligned to 32B.
* @requires SSE4.1
*
* @param len Length of vector in bytes. Must be aligned to 32B.
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
* @param src Pointer to src data array. Must be aligned to 32B.
* @param dest Pointer to destination data array. Must be aligned to 32B.
* @returns 0 pass, other fail
*/
int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest);
/**
* @brief GF(2^8) vector multiply by constant.
*
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
* and partial write functions. Function requires pre-calculation of a
* 32-element constant array based on constant C. gftbl(C) = {C{00},
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
* and src must be aligned to 32B.
* @requires AVX
*
* @param len Length of vector in bytes. Must be aligned to 32B.
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
* @param src Pointer to src data array. Must be aligned to 32B.
* @param dest Pointer to destination data array. Must be aligned to 32B.
* @returns 0 pass, other fail
*/
int gf_vect_mul_avx(int len, unsigned char *gftbl, void *src, void *dest);
/**
* @brief GF(2^8) vector multiply by constant, runs appropriate version.
*
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
* and partial write functions. Function requires pre-calculation of a
* 32-element constant array based on constant C. gftbl(C) = {C{00},
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
* Len and src must be aligned to 32B.
*
* This function determines what instruction sets are enabled
* and selects the appropriate version at runtime.
*
* @param len Length of vector in bytes. Must be aligned to 32B.
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
* @param src Pointer to src data array. Must be aligned to 32B.
* @param dest Pointer to destination data array. Must be aligned to 32B.
* @returns 0 pass, other fail
*/
int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
/**
* @brief Initialize 32-byte constant array for GF(2^8) vector multiply
*
* Calculates array {C{00}, C{01}, C{02}, ... , C{0f} }, {C{00}, C{10},
* C{20}, ... , C{f0} } as required by other fast vector multiply
* functions.
* @param c Constant input.
* @param gftbl Table output.
*/
void gf_vect_mul_init(unsigned char c, unsigned char* gftbl);
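/*
 * Example: multiply a 32B-aligned block by the constant 2 using the table
 * helper plus the dispatched multiply (illustrative sketch only, not part of
 * the original header; len, src and dest are hypothetical and len must be a
 * multiple of 32).
 *
 *     unsigned char gftbl[32];
 *     gf_vect_mul_init(2, gftbl);
 *     if (gf_vect_mul(len, gftbl, src, dest) != 0)
 *             return -1;                 // bad length or alignment
 */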
/**
* @brief GF(2^8) vector multiply by constant, runs baseline version.
*
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
* and partial write functions. Function requires pre-calculation of a
* 32-element constant array based on constant C. gftbl(C) = {C{00},
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
* and src must be aligned to 32B.
*
* @param len Length of vector in bytes. Must be aligned to 32B.
* @param a     Pointer to 32-byte array of pre-calculated constants based on C.
*              Only the second element of the array is used.
* @param src Pointer to src data array. Must be aligned to 32B.
* @param dest Pointer to destination data array. Must be aligned to 32B.
*/
void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src,
unsigned char *dest);
#ifdef __cplusplus
}
#endif
#endif //_GF_VECT_MUL_H

123
include/reg_sizes.asm Normal file
View File

@ -0,0 +1,123 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifndef _REG_SIZES_ASM_
%define _REG_SIZES_ASM_
%define EFLAGS_HAS_CPUID (1<<21)
%define FLAG_CPUID1_ECX_CLMUL (1<<1)
%define FLAG_CPUID1_EDX_SSE2 (1<<26)
%define FLAG_CPUID1_ECX_SSE3 (1)
%define FLAG_CPUID1_ECX_SSE4_1 (1<<19)
%define FLAG_CPUID1_ECX_SSE4_2 (1<<20)
%define FLAG_CPUID1_ECX_POPCNT (1<<23)
%define FLAG_CPUID1_ECX_AESNI (1<<25)
%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
%define FLAG_CPUID1_ECX_AVX (1<<28)
%define FLAG_CPUID1_EBX_AVX2 (1<<5)
%define FLAG_XGETBV_EAX_XMM_YMM 0x6
%define FLAG_CPUID1_EAX_AVOTON 0x000406d0
; define d and w variants for registers
%define raxd eax
%define raxw ax
%define raxb al
%define rbxd ebx
%define rbxw bx
%define rbxb bl
%define rcxd ecx
%define rcxw cx
%define rcxb cl
%define rdxd edx
%define rdxw dx
%define rdxb dl
%define rsid esi
%define rsiw si
%define rsib sil
%define rdid edi
%define rdiw di
%define rdib dil
%define rbpd ebp
%define rbpw bp
%define rbpb bpl
%define ymm0x xmm0
%define ymm1x xmm1
%define ymm2x xmm2
%define ymm3x xmm3
%define ymm4x xmm4
%define ymm5x xmm5
%define ymm6x xmm6
%define ymm7x xmm7
%define ymm8x xmm8
%define ymm9x xmm9
%define ymm10x xmm10
%define ymm11x xmm11
%define ymm12x xmm12
%define ymm13x xmm13
%define ymm14x xmm14
%define ymm15x xmm15
%define DWORD(reg) reg %+ d
%define WORD(reg) reg %+ w
%define BYTE(reg) reg %+ b
%define XWORD(reg) reg %+ x
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
%ifidn __OUTPUT_FORMAT__, macho64
%define elf64 macho64
%endif
%macro slversion 4
section .text
global %1_slver_%2%3%4
global %1_slver
%1_slver:
%1_slver_%2%3%4:
dw 0x%4
db 0x%3, 0x%2
%endmacro
%endif ; ifndef _REG_SIZES_ASM_

81
include/test.h Normal file
View File

@ -0,0 +1,81 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#ifndef _TEST_H
#define _TEST_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
// Use sys/time.h functions for time
#include <sys/time.h>
struct perf{
struct timeval tv;
};
inline int perf_start(struct perf *p)
{
return gettimeofday(&(p->tv), 0);
}
inline int perf_stop(struct perf *p)
{
return gettimeofday(&(p->tv), 0);
}
inline void perf_print(struct perf stop, struct perf start, long long dsize)
{
long long secs = stop.tv.tv_sec - start.tv.tv_sec;
long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec;
printf("runtime = %10lld usecs", usecs);
if (dsize != 0) {
#if 1 // no printf bug for 32-bit; use the combined form
printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024),
((double) usecs)/1000000, ((double) dsize) / (double)usecs);
#else
printf(", bandwidth %lld MB ", dsize/(1024*1024));
printf("in %.4f sec ",(double)usecs/1000000);
printf("= %.2f MB/s\n", (double)dsize/usecs);
#endif
}
else
printf("\n");
}
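/*
 * Example usage of the helpers above (illustrative sketch only, not part of
 * the original header; test_fn, buf and TEST_LEN are hypothetical):
 *
 *     struct perf start, stop;
 *     perf_start(&start);
 *     test_fn(buf, TEST_LEN);
 *     perf_stop(&stop);
 *     perf_print(stop, start, (long long) TEST_LEN);
 */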
#ifdef __cplusplus
}
#endif
#endif // _TEST_H

88
include/types.h Normal file
View File

@ -0,0 +1,88 @@
/**********************************************************************
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
/**
* @file types.h
* @brief Defines standard width types.
*
*/
#ifndef __TYPES_H
#define __TYPES_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __WIN32__
#ifdef __MINGW32__
# include <_mingw.h>
#endif
typedef unsigned __int64 UINT64;
typedef __int64 INT64;
typedef unsigned __int32 UINT32;
typedef unsigned __int16 UINT16;
typedef unsigned char UINT8;
#else
typedef unsigned long int UINT64;
typedef long int INT64;
typedef unsigned int UINT32;
typedef unsigned short int UINT16;
typedef unsigned char UINT8;
#endif
#if defined __unix__ || defined __APPLE__
# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
# define __forceinline static inline
# define aligned_free(x) free(x)
#else
# ifdef __MINGW32__
# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
# define aligned_free(x) _aligned_free(x)
# else
# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
# define aligned_free(x) _aligned_free(x)
# endif
#endif
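/*
 * Example: the portability macros above can be used as follows (illustrative
 * sketch only, not part of the original header):
 *
 *     DECLARE_ALIGNED(unsigned char buf[64], 16);  // 16B-aligned static buffer
 *     void *p;
 *     if (posix_memalign(&p, 32, 1024) == 0)       // 32B-aligned heap buffer
 *             aligned_free(p);
 */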
#ifdef DEBUG
# define DEBUG_PRINT(x) printf x
#else
# define DEBUG_PRINT(x) do {} while (0)
#endif
#ifdef __cplusplus
}
#endif
#endif //__TYPES_H

56
isa-l.def Normal file
View File

@ -0,0 +1,56 @@
LIBRARY isa-l
EXPORTS
ec_encode_data_sse @1
ec_init_tables @2
gf_gen_cauchy1_matrix @3
gf_gen_rs_matrix @4
gf_invert_matrix @5
gf_mul @6
gf_vect_dot_prod_base @7
gf_vect_mul_base @8
ec_encode_data_base @9
gf_vect_mul_init @10
gf_vect_mul_sse @11
gf_vect_mul_avx @12
gf_vect_dot_prod_sse @13
gf_vect_dot_prod_avx @14
gf_vect_dot_prod_avx2 @15
gf_2vect_dot_prod_sse @16
gf_3vect_dot_prod_sse @17
gf_4vect_dot_prod_sse @18
gf_5vect_dot_prod_sse @19
gf_6vect_dot_prod_sse @20
gf_2vect_dot_prod_avx @21
gf_3vect_dot_prod_avx @22
gf_4vect_dot_prod_avx @23
gf_5vect_dot_prod_avx @24
gf_6vect_dot_prod_avx @25
gf_2vect_dot_prod_avx2 @26
gf_3vect_dot_prod_avx2 @27
gf_4vect_dot_prod_avx2 @28
gf_5vect_dot_prod_avx2 @29
gf_6vect_dot_prod_avx2 @30
gf_vect_mad_sse @31
gf_2vect_mad_sse @32
gf_3vect_mad_sse @33
gf_4vect_mad_sse @34
gf_5vect_mad_sse @35
gf_6vect_mad_sse @36
gf_vect_mad_avx @37
gf_2vect_mad_avx @38
gf_3vect_mad_avx @39
gf_4vect_mad_avx @40
gf_5vect_mad_avx @41
gf_6vect_mad_avx @42
gf_vect_mad_avx2 @43
gf_2vect_mad_avx2 @44
gf_3vect_mad_avx2 @45
gf_4vect_mad_avx2 @46
gf_5vect_mad_avx2 @47
gf_6vect_mad_avx2 @48
ec_encode_data @49
gf_vect_mul @50
ec_encode_data_update @51
gf_vect_dot_prod @52
gf_vect_mad @53

11
libisal.pc.in Normal file
View File

@ -0,0 +1,11 @@
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
Name: libisal
Description: Library for storage systems
Version: @VERSION@
Libs: -L${libdir} -lisal
Libs.private:
Cflags: -I${includedir}

246
make.inc Normal file
View File

@ -0,0 +1,246 @@
########################################################################
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
# Makefile include for optimized libraries
# make targets:
# lib - build library of optimized functions
# slib - build shared library
# test - run unit tests of functions
# perf - run performance tests
# install - install headers and libs to system location
# sim - run on simulator
# trace - get simulator trace
# clean - remove object files
version ?= #auto filled on release
CC = gcc
AS = yasm
SIM = sde $(SIMFLAGS) --
DEBUG = -g
DEBUG_yasm = -g dwarf2
DEBUG_nasm = -g
# Default arch= build options
CFLAGS_gcc = -Wall
ASFLAGS_ = -f elf64
ARFLAGS_ = cr $@
STRIP_gcc = strip -d -R .comment $@
# arch=32 build options
ASFLAGS_32 = -f elf32
CFLAGS_32 = -m32
ARFLAGS_32 = cr $@
# arch=win64 build options
ASFLAGS_win64 = -f win64
CFLAGS_icl = -Qstd=c99
ARFLAGS_win64 = -out:$@
# arch=mingw build options
ASFLAGS_mingw = -f win64
ARFLAGS_mingw = cr $@
lsrcmingw = $(lsrc)
unit_testsmingw = $(unit_tests)
examplesmingw = $(examples)
perf_testsmingw = $(perf_tests)
ifeq ($(arch),mingw)
CC=x86_64-w64-mingw32-gcc
AR=x86_64-w64-mingw32-ar
LDFLAGS = -Wl,--force-exe-suffix
endif
INCLUDE = $(patsubst %,-I%,$(subst :, ,$(VPATH)))
CFLAGS = $(CFLAGS_$(arch)) $(CFLAGS_$(CC)) $(DEBUG) -O2 $(DEFINES) $(INCLUDE)
ASFLAGS = $(ASFLAGS_$(arch)) $(ASFLAGS_$(CC)) $(DEBUG_$(AS)) $(DEFINES) $(INCLUDE)
ARFLAGS = $(ARFLAGS_$(arch))
DEFINES += $(addprefix -D , $D)
O = bin
lobj += $(patsubst %.c,%.o,$(patsubst %.asm,%.o,$(lsrc$(arch)) $(lsrc_intrinsic)))
objs = $(addprefix $(O)/,$(notdir $(lobj)))
lib_name ?= isa-l.a
default: lib slib
# Defaults for windows build
ifeq ($(arch),win64)
AR=lib
CC=cl
OUTPUT_OPTION = -Fo$@
DEBUG=
lib_name := $(basename $(lib_name)).lib
endif
lsrcwin64 = $(lsrc)
unit_testswin64 = $(unit_tests)
exampleswin64 = $(examples)
perf_testswin64 = $(perf_tests)
# Build and run unit tests, performance tests, etc.
all_tests = $(notdir $(sort $(perf_tests$(arch)) $(check_tests$(arch)) $(unit_tests$(arch)) $(examples$(arch)) $(other_tests)))
all_unit_tests = $(notdir $(sort $(check_tests$(arch)) $(unit_tests$(arch))))
$(all_unit_tests): % : %.c $(lib_name)
$(sort $(notdir $(perf_tests$(arch)))): % : %.c $(lib_name)
$(sort $(examples$(arch))): % : %.c $(lib_name)
$(sort $(other_tests)): % : %.c $(lib_name)
sim test trace: $(addsuffix .run,$(all_unit_tests))
perf: $(addsuffix .run,$(notdir $(perf_tests$(arch))))
ex: $(examples$(arch))
all: lib $(all_tests)
other: $(other_tests)
tests: $(all_unit_tests)
perfs: $(notdir $(perf_tests$(arch)))
check test perf: SIM=
trace: SIMFLAGS = -debugtrace
check test sim:
@echo Finished running $@
#$(foreach c, $^, ./$c )
#for i in $^; do ./$$i ; done
$(objs): | $(O)
$(O): ; mkdir -p $(O)
# Build rule to run tests
%.run: %
$(SIM) $(@D)/$<
@echo Completed run: $<
# Other build rules
msg = $(if $(DEBUG),DEBUG) $(patsubst 32,32-bit,$(arch)) $D
$(O)/%.o: %.asm
@echo " ---> Building $< $(msg)"
@$(AS) $(ASFLAGS) -o $@ $<
$(O)/%.o %.o: %.c
@echo " ---> Building $< $(msg)"
@$(COMPILE.c) $(OUTPUT_OPTION) $<
$(all_tests):
@echo " ---> Building Test $@ $(msg)"
@$(LINK.o) $(CFLAGS) $^ $(LDLIBS) -o $@
# Target to build lib files
lib: $(lib_name)
ifneq ($(lib_debug),1)
$(lib_name): DEBUG_$(AS)= # Don't put debug symbols in the lib
$(lib_name): DEBUG=
$(lib_name): DEFINES+=-D NDEBUG
endif
ifeq ($(lib_debug),1)
DEBUG+=-D DEBUG # Define DEBUG for macros
endif
#lib $(lib_name): $(lib_name)(${objs})
$(lib_name): $(objs)
@echo " ---> Creating Lib $@"
@$(AR) $(ARFLAGS) $^
@$(STRIP_$(CC))
# Target for shared lib
so_lib_name = bin/libisal.so
so_lib_inst = $(notdir $(so_lib_name))
so_lib_ver = $(so_lib_inst).$(version)
soname = $(so_lib_inst).$(word 1, $(subst ., ,$(version)))
slib: $(so_lib_name)
aobjs += $(addprefix $(O)/,$(patsubst %.asm,%.o,$(filter %.asm,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))
shared_objs += $(addprefix $(O)/shared_ver_,$(patsubst %.c,%.o,$(filter %.c,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))
$(O)/shared_ver_%.o: %.c
@echo " ---> Building shared $< $(msg)"
@$(COMPILE.c) $(OUTPUT_OPTION) $<
ifneq ($(lib_debug),1)
$(so_lib_name): DEBUG_$(AS)=
$(so_lib_name): DEBUG=
$(so_lib_name): DEFINES+=-D NDEBUG
endif
$(shared_objs): CFLAGS += -fPIC
$(shared_objs) $(aobjs): | $(O)
$(so_lib_name): LDFLAGS+=-Wl,-soname,$(soname)
$(so_lib_name): $(shared_objs) $(aobjs)
@echo " ---> Creating Shared Lib $@"
@$(CC) $(CFLAGS) --shared $(LDFLAGS) -o $@ $^
@(cd $(@D); ln -f -s $(so_lib_inst) $(soname))
# Target for install
prefix = /usr/local
install_dirs = $(prefix)/lib $(prefix)/include/isa-l
$(install_dirs): ; mkdir -p $@
install: $(sort $(extern_hdrs)) | $(install_dirs) $(lib_name) $(so_lib_name) isa-l.h
install -m 644 $(lib_name) $(prefix)/lib/libisal.a
install -m 644 $^ $(prefix)/include/isa-l/.
install -m 664 include/isa-l.h $(prefix)/include/.
install -m 664 $(so_lib_name) $(prefix)/lib/$(so_lib_ver)
(cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(soname) && ln -f -s $(so_lib_ver) $(so_lib_inst))
ifeq ($(shell uname),Darwin)
(cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(basename $(so_lib_inst)).dylib)
endif
which libtool && libtool --mode=finish $(prefix)/lib || \
echo 'Lib installed at $(prefix)/lib. Run system-dependent programs to add shared lib path.'
uninstall:
$(RM) $(prefix)/lib/libisal.a
$(RM) $(prefix)/lib/$(soname)
$(RM) $(prefix)/lib/$(so_lib_ver)
$(RM) $(prefix)/lib/$(so_lib_inst)
$(RM) -r $(prefix)/include/isa-l
$(RM) $(prefix)/include/isa-l.h
$(RM) $(prefix)/lib/$(basename $(so_lib_inst)).dylib
# Collect performance data
rpt_name = perf_report_$(shell uname -n)_$(shell date +%y%m%d).perf
perf_report:
echo Results for $(rpt_name) >> $(rpt_name)
$(MAKE) -k perf | tee -a $(rpt_name)
@echo Summary:
-grep runtime $(rpt_name)
clean:
@echo Cleaning up
@$(RM) -r $(O) *.o *.a $(all_tests) $(lib_name) $(so_lib_name)

31
tools/yasm-filter.sh Executable file
View File

@ -0,0 +1,31 @@
#!/bin/sh
# Filter out unnecessary options added by automake
while [ -n "$*" ]; do
case "$1" in
-f | -o | -I | -i | -D )
# Supported options with arg
options="$options $1 $2"
shift
shift
;;
-I* | -i* | --prefix* )
# Supported options without arg
options="$options $1"
shift
;;
#-blah )
# Unsupported options with args - none known
-* )
# Unsupported options with no args
shift
;;
* )
args="$args $1"
shift
;;
esac
done
yasm $options $args