mirror of
https://github.com/intel/isa-l.git
synced 2024-12-12 09:23:50 +01:00
Initial commit isa-l v2.14.1
Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
This commit is contained in:
commit
00c1efc109
26
About_bsd.txt
Normal file
26
About_bsd.txt
Normal file
@ -0,0 +1,26 @@
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
100
Makefile.am
Normal file
100
Makefile.am
Normal file
@ -0,0 +1,100 @@
|
||||
EXTRA_DIST = autogen.sh Makefile.unx make.inc Makefile.nmake isa-l.def About_bsd.txt
|
||||
CLEANFILES =
|
||||
LDADD =
|
||||
AM_MAKEFLAGS = --no-print-directory
|
||||
noinst_HEADERS =
|
||||
pkginclude_HEADERS = include/test.h
|
||||
noinst_LTLIBRARIES =
|
||||
INCLUDE = -I $(srcdir)/include
|
||||
AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} ${D}
|
||||
|
||||
lsrc=
|
||||
extern_hdrs=
|
||||
other_src=
|
||||
check_tests=
|
||||
unit_tests=
|
||||
perf_tests=
|
||||
unit_tests_extra=
|
||||
perf_tests_extra=
|
||||
examples=
|
||||
other_tests=
|
||||
lsrc32=
|
||||
unit_tests32=
|
||||
perf_tests32=
|
||||
|
||||
# Include units
|
||||
include erasure_code/Makefile.am
|
||||
|
||||
# LIB version info not necessarily the same as package version
|
||||
LIBISAL_CURRENT=2
|
||||
LIBISAL_REVISION=14
|
||||
LIBISAL_AGE=0
|
||||
|
||||
lib_LTLIBRARIES = libisal.la
|
||||
pkginclude_HEADERS += $(sort ${extern_hdrs})
|
||||
libisal_la_SOURCES = ${lsrc}
|
||||
nobase_include_HEADERS = isa-l.h
|
||||
libisal_la_LDFLAGS = $(AM_LDFLAGS) \
|
||||
-version-info $(LIBISAL_CURRENT):$(LIBISAL_REVISION):$(LIBISAL_AGE)
|
||||
libisal_la_LIBADD = ${noinst_LTLIBRARIES}
|
||||
|
||||
EXTRA_DIST += ${other_src}
|
||||
EXTRA_DIST += Release_notes.txt
|
||||
|
||||
# For tests
|
||||
LDADD += libisal.la
|
||||
check_PROGRAMS = ${check_tests}
|
||||
TESTS = ${check_tests}
|
||||
|
||||
# For additional tests
|
||||
EXTRA_PROGRAMS = ${unit_tests}
|
||||
EXTRA_PROGRAMS += ${perf_tests}
|
||||
EXTRA_PROGRAMS += ${other_tests}
|
||||
EXTRA_PROGRAMS += ${examples}
|
||||
CLEANFILES += ${EXTRA_PROGRAMS}
|
||||
|
||||
perfs: ${perf_tests}
|
||||
tests: ${unit_tests}
|
||||
other: ${other_tests}
|
||||
perf: $(addsuffix .run,$(perf_tests))
|
||||
ex: ${examples}
|
||||
test: $(addsuffix .run,$(unit_tests))
|
||||
|
||||
# Build rule to run tests
|
||||
%.run: %
|
||||
$<
|
||||
@echo Completed run: $<
|
||||
|
||||
# Support for yasm
|
||||
CCAS = ${srcdir}/tools/yasm-filter.sh
|
||||
EXTRA_DIST += tools/yasm-filter.sh
|
||||
AM_CCASFLAGS = ${yasm_args} ${INCLUDE}
|
||||
|
||||
.asm.s:
|
||||
@echo " MKTMP " $@;
|
||||
@cp $< $@
|
||||
|
||||
# Generate isa-l.h
|
||||
BUILT_SOURCES = isa-l.h
|
||||
CLEANFILES += isa-l.h
|
||||
isa-l.h:
|
||||
@echo 'Building $@'
|
||||
@echo '' >> $@
|
||||
@echo '#ifndef _ISAL_H_' >> $@
|
||||
@echo '#define _ISAL_H_' >> $@
|
||||
@echo '' >> $@
|
||||
@echo '#define.ISAL_MAJOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
|
||||
@echo '#define.ISAL_MINOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
|
||||
@echo '#define.ISAL_PATCH_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
|
||||
@echo '#define ISAL_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
|
||||
@echo '#define ISAL_VERSION ISAL_MAKE_VERSION(ISAL_MAJOR_VERSION, ISAL_MINOR_VERSION, ISAL_PATCH_VERSION)' >> $@
|
||||
@echo '' >> $@
|
||||
@for unit in ${extern_hdrs}; do echo "#include <isa-l/$$unit>" | sed -e 's;include/;;' >> $@; done
|
||||
@echo '#endif //_ISAL_H_' >> $@
|
||||
|
||||
|
||||
license = bsd
|
||||
licc = $(srcdir)/doc/license_$(license)_c.txt
|
||||
lica = $(srcdir)/doc/license_$(license)_asm.txt
|
||||
licm = $(srcdir)/doc/license_$(license)_make.txt
|
||||
|
88
Makefile.nmake
Normal file
88
Makefile.nmake
Normal file
@ -0,0 +1,88 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj
|
||||
|
||||
INCLUDES = -I. -Ierasure_code -Iinclude
|
||||
LINKFLAGS = /nologo
|
||||
CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D)
|
||||
AFLAGS = -f win64 $(INCLUDES) $(D)
|
||||
CC = icl
|
||||
AS = yasm
|
||||
|
||||
lib: bin static dll
|
||||
static: bin isa-l_static.lib
|
||||
dll: bin isa-l.dll
|
||||
|
||||
bin: ; -mkdir $@
|
||||
|
||||
isa-l_static.lib: $(objs)
|
||||
lib -out:$@ $?
|
||||
|
||||
isa-l.dll: $(objs)
|
||||
link -out:$@ -dll -def:isa-l.def $?
|
||||
|
||||
{erasure_code}.c.obj:
|
||||
$(CC) $(CFLAGS) /c -Fo$@ $?
|
||||
{erasure_code}.asm.obj:
|
||||
$(AS) $(AFLAGS) -o $@ $?
|
||||
|
||||
|
||||
|
||||
.obj.exe:
|
||||
link /out:$@ $(LINKFLAGS) isa-l.lib $?
|
||||
|
||||
# Check tests
|
||||
checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe
|
||||
|
||||
checks: lib $(checks)
|
||||
$(checks): $(@B).obj
|
||||
check: $(checks)
|
||||
!$?
|
||||
|
||||
# Unit tests
|
||||
tests = erasure_code_base_test.exe erasure_code_sse_test.exe gf_2vect_dot_prod_sse_test.exe gf_3vect_dot_prod_sse_test.exe gf_4vect_dot_prod_sse_test.exe gf_5vect_dot_prod_sse_test.exe gf_6vect_dot_prod_sse_test.exe gf_vect_dot_prod_avx_test.exe gf_vect_dot_prod_base_test.exe gf_vect_dot_prod_sse_test.exe gf_vect_dot_prod_test.exe gf_vect_mad_test.exe gf_vect_mul_avx_test.exe gf_vect_mul_base_test.exe gf_vect_mul_sse_test.exe
|
||||
|
||||
tests: lib $(tests)
|
||||
$(tests): $(@B).obj
|
||||
|
||||
# Performance tests
|
||||
perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe
|
||||
|
||||
perfs: lib $(perfs)
|
||||
$(perfs): $(@B).obj
|
||||
|
||||
clean:
|
||||
-if exist *.obj del *.obj
|
||||
-if exist bin\*.obj del bin\*.obj
|
||||
-if exist isa-l_static.lib del isa-l_static.lib
|
||||
-if exist *.exe del *.exe
|
||||
-if exist isa-l.lib del isa-l.lib
|
||||
-if exist isa-l.dll del isa-l.dll
|
||||
|
41
Makefile.unx
Normal file
41
Makefile.unx
Normal file
@ -0,0 +1,41 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
units = erasure_code
|
||||
|
||||
default: lib
|
||||
|
||||
include $(foreach unit,$(units), $(unit)/Makefile.am)
|
||||
|
||||
# Override individual lib names to make one inclusive library.
|
||||
lib_name := bin/isa-l.a
|
||||
|
||||
include make.inc
|
||||
|
||||
VPATH = . $(units) include
|
19
README
Normal file
19
README
Normal file
@ -0,0 +1,19 @@
|
||||
=================================================
|
||||
Intel(R) Intelligent Storage Acceleration Library
|
||||
=================================================
|
||||
|
||||
Build Prerequisites
|
||||
===================
|
||||
|
||||
ISA-L requires yasm version 1.2 or later.
|
||||
|
||||
Building ISA-L
|
||||
==============
|
||||
|
||||
To build and install the library it is usually sufficient to run the following.
|
||||
|
||||
./configure
|
||||
make
|
||||
sudo make install
|
||||
|
||||
Other targets include: make check, make tests and make perfs.
|
74
Release_notes.txt
Normal file
74
Release_notes.txt
Normal file
@ -0,0 +1,74 @@
|
||||
=============================================================================
|
||||
v2.14 Intel Intelligent Storage Acceleration Library Release Notes
|
||||
Open Source Version
|
||||
=============================================================================
|
||||
|
||||
=============================================================================
|
||||
RELEASE NOTE CONTENTS
|
||||
=============================================================================
|
||||
1. KNOWN ISSUES
|
||||
2. FIXED ISSUES
|
||||
3. CHANGE LOG & FEATURES ADDED
|
||||
|
||||
=============================================================================
|
||||
1. KNOWN ISSUES
|
||||
=============================================================================
|
||||
|
||||
* Only erasure code unit included in open source version at this time.
|
||||
|
||||
* Perf tests do not run in Windows environment.
|
||||
|
||||
* 32-bit lib is not supported in Windows.
|
||||
|
||||
=============================================================================
|
||||
2. FIXED ISSUES
|
||||
=============================================================================
|
||||
v2.14
|
||||
|
||||
* Building in unit directories is no longer supported removing the issue of
|
||||
leftover object files causing the top-level make build to fail.
|
||||
|
||||
v2.10
|
||||
|
||||
* Fix for windows register save overlap in gf_{3-6}vect_dot_prod_sse.asm. Only
|
||||
affects windows versions of erasure code. GP register saves/restore were
|
||||
pushed to same stack area as XMM.
|
||||
|
||||
=============================================================================
|
||||
3. CHANGE LOG & FEATURES ADDED
|
||||
=============================================================================
|
||||
v2.14
|
||||
|
||||
* Autoconf and autotools build allows easier porting to additional systems.
|
||||
Previous make system still available to embedded users with Makefile.unx.
|
||||
|
||||
* Includes update for building on Mac OS X/darwin systems. Add --target=darwin
|
||||
to ./configure step.
|
||||
|
||||
v2.13
|
||||
|
||||
* Erasure code improvments
|
||||
- 32-bit port of optimized gf_vect_dot_prod() functions. This makes
|
||||
ec_encode_data() functions much faster on 32-bit processors.
|
||||
- Avoton performance improvements. Performance on Avoton for
|
||||
gf_vect_dot_prod() and ec_encode_data() can improve by as much as 20%.
|
||||
|
||||
v2.11
|
||||
|
||||
* Incremental erasure code. New functions added to erasure code to handle
|
||||
single source update of code blocks. The function ec_encode_data_update()
|
||||
works with parameters similar to ec_encode_data() but are called incrementally
|
||||
with each source block. These versions are useful when source blocks are not
|
||||
all available at once.
|
||||
|
||||
v2.10
|
||||
|
||||
* Erasure code updates
|
||||
- New AVX and AVX2 support functions.
|
||||
- Changes min len requirement on gf_vect_dot_prod() to 32 from 16.
|
||||
- Tests include both source and parity recovery with ec_encode_data().
|
||||
- New encoding examples with Vandermonde or Cauchy matrix.
|
||||
|
||||
v2.8
|
||||
|
||||
* First open release of erasure code unit that is part of ISA-L.
|
17
autogen.sh
Executable file
17
autogen.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/sh -e
|
||||
|
||||
autoreconf --install --symlink -f
|
||||
|
||||
libdir() {
|
||||
echo $(cd $1/$(gcc -print-multi-os-directory); pwd)
|
||||
}
|
||||
|
||||
args="--prefix=/usr --libdir=$(libdir /usr/lib)"
|
||||
|
||||
echo
|
||||
echo "----------------------------------------------------------------"
|
||||
echo "Initialized build system. For a common configuration please run:"
|
||||
echo "----------------------------------------------------------------"
|
||||
echo
|
||||
echo "./configure $args"
|
||||
echo
|
112
configure.ac
Normal file
112
configure.ac
Normal file
@ -0,0 +1,112 @@
|
||||
# -*- Autoconf -*-
|
||||
# Process this file with autoconf to produce a configure script.
|
||||
|
||||
AC_PREREQ(2.69)
|
||||
AC_INIT([libisal],
|
||||
[2.14.0],
|
||||
[sg.support.isal@intel.com],
|
||||
[isa-l],
|
||||
[http://01.org/storage-acceleration-library])
|
||||
AC_CONFIG_SRCDIR([])
|
||||
AC_CONFIG_AUX_DIR([build-aux])
|
||||
AM_INIT_AUTOMAKE([
|
||||
foreign
|
||||
1.11
|
||||
-Wall
|
||||
-Wno-portability
|
||||
silent-rules
|
||||
tar-pax
|
||||
no-dist-gzip
|
||||
dist-xz
|
||||
subdir-objects
|
||||
])
|
||||
AM_PROG_AS
|
||||
|
||||
# Check for programs
|
||||
AC_PROG_CC_STDC
|
||||
AC_USE_SYSTEM_EXTENSIONS
|
||||
AM_SILENT_RULES([yes])
|
||||
LT_INIT
|
||||
AC_PREFIX_DEFAULT([/usr])
|
||||
AC_PROG_SED
|
||||
AC_PROG_MKDIR_P
|
||||
AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
|
||||
if test "$HAVE_YASM" = "no"; then
|
||||
AC_MSG_ERROR([yasm not found as required.])
|
||||
fi
|
||||
AC_MSG_CHECKING([checking for modern yasm])
|
||||
AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
|
||||
if yasm -f elf64 -p gas conftest.c ; then
|
||||
AC_MSG_RESULT([yes])
|
||||
else
|
||||
AC_MSG_FAILURE([need modern yasm])
|
||||
fi
|
||||
|
||||
# Options
|
||||
AC_ARG_ENABLE([debug],
|
||||
AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]),
|
||||
[], [enable_debug=no])
|
||||
AS_IF([test "x$enable_debug" = "xyes"], [
|
||||
AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.])
|
||||
])
|
||||
|
||||
|
||||
case $target in
|
||||
*linux*) arch=linux yasm_args="-f elf64";;
|
||||
*darwin*) arch=darwin yasm_args="-f macho64 --prefix=_ ";;
|
||||
*netbsd*) arch=netbsd yasm_args="-f elf64";;
|
||||
*) arch=unknown yasm_args="-f elf64";;
|
||||
esac
|
||||
AC_SUBST([yasm_args])
|
||||
AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin")
|
||||
AC_MSG_RESULT([Using yasm args target "$arch" "$yasm_args"])
|
||||
|
||||
# Check for header files
|
||||
#AC_CHECK_HEADERS([limits.h stddef.h stdint.h stdlib.h string.h sys/time.h unistd.h])
|
||||
AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h])
|
||||
|
||||
# Checks for typedefs, structures, and compiler characteristics.
|
||||
AC_C_INLINE
|
||||
AC_TYPE_SIZE_T
|
||||
AC_TYPE_UINT16_T
|
||||
AC_TYPE_UINT32_T
|
||||
AC_TYPE_UINT64_T
|
||||
AC_TYPE_UINT8_T
|
||||
|
||||
# Checks for library functions.
|
||||
AC_FUNC_MALLOC # Used only in tests
|
||||
AC_CHECK_FUNCS([memmove memset])
|
||||
|
||||
my_CFLAGS="\
|
||||
-Wall \
|
||||
-Wchar-subscripts \
|
||||
-Wformat-security \
|
||||
-Wnested-externs \
|
||||
-Wpointer-arith \
|
||||
-Wshadow \
|
||||
-Wstrict-prototypes \
|
||||
-Wtype-limits \
|
||||
"
|
||||
AC_SUBST([my_CFLAGS])
|
||||
|
||||
AC_CONFIG_FILES([\
|
||||
Makefile\
|
||||
libisal.pc
|
||||
])
|
||||
|
||||
AC_OUTPUT
|
||||
AC_MSG_RESULT([
|
||||
$PACKAGE $VERSION
|
||||
=====
|
||||
|
||||
prefix: ${prefix}
|
||||
sysconfdir: ${sysconfdir}
|
||||
libdir: ${libdir}
|
||||
includedir: ${includedir}
|
||||
|
||||
compiler: ${CC}
|
||||
cflags: ${CFLAGS}
|
||||
ldflags: ${LDFLAGS}
|
||||
|
||||
debug: ${enable_debug}
|
||||
])
|
159
erasure_code/Makefile.am
Normal file
159
erasure_code/Makefile.am
Normal file
@ -0,0 +1,159 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
lsrc += erasure_code/ec_highlevel_func.c \
|
||||
erasure_code/ec_base.c \
|
||||
erasure_code/gf_vect_mul_sse.asm \
|
||||
erasure_code/gf_vect_mul_avx.asm \
|
||||
erasure_code/gf_vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_2vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_3vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_4vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_5vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_6vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_5vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_6vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_5vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_6vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_vect_mad_sse.asm \
|
||||
erasure_code/gf_2vect_mad_sse.asm \
|
||||
erasure_code/gf_3vect_mad_sse.asm \
|
||||
erasure_code/gf_4vect_mad_sse.asm \
|
||||
erasure_code/gf_5vect_mad_sse.asm \
|
||||
erasure_code/gf_6vect_mad_sse.asm \
|
||||
erasure_code/gf_vect_mad_avx.asm \
|
||||
erasure_code/gf_2vect_mad_avx.asm \
|
||||
erasure_code/gf_3vect_mad_avx.asm \
|
||||
erasure_code/gf_4vect_mad_avx.asm \
|
||||
erasure_code/gf_5vect_mad_avx.asm \
|
||||
erasure_code/gf_6vect_mad_avx.asm \
|
||||
erasure_code/gf_vect_mad_avx2.asm \
|
||||
erasure_code/gf_2vect_mad_avx2.asm \
|
||||
erasure_code/gf_3vect_mad_avx2.asm \
|
||||
erasure_code/gf_4vect_mad_avx2.asm \
|
||||
erasure_code/gf_5vect_mad_avx2.asm \
|
||||
erasure_code/gf_6vect_mad_avx2.asm \
|
||||
erasure_code/ec_multibinary.asm
|
||||
|
||||
lsrc32 += erasure_code/ec_highlevel_func.c \
|
||||
erasure_code/ec_multibinary.asm \
|
||||
erasure_code/ec_base.c \
|
||||
erasure_code/gf_vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx.asm \
|
||||
erasure_code/gf_vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_2vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_3vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_4vect_dot_prod_sse.asm \
|
||||
erasure_code/gf_vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_2vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_3vect_dot_prod_avx2.asm \
|
||||
erasure_code/gf_4vect_dot_prod_avx2.asm
|
||||
|
||||
unit_tests32 += erasure_code_base_test \
|
||||
erasure_code/erasure_code_test \
|
||||
erasure_code/erasure_code_sse_test \
|
||||
erasure_code/gf_vect_mul_test \
|
||||
erasure_code/gf_vect_mul_base_test \
|
||||
erasure_code/gf_vect_dot_prod_base_test \
|
||||
erasure_code/gf_vect_dot_prod_test \
|
||||
erasure_code/gf_vect_dot_prod_avx_test \
|
||||
erasure_code/gf_vect_dot_prod_sse_test \
|
||||
erasure_code/gf_2vect_dot_prod_sse_test \
|
||||
erasure_code/gf_3vect_dot_prod_sse_test \
|
||||
erasure_code/gf_4vect_dot_prod_sse_test
|
||||
|
||||
perf_tests32 += erasure_code/gf_vect_mul_perf \
|
||||
erasure_code/gf_vect_dot_prod_perf \
|
||||
erasure_code/erasure_code_perf \
|
||||
erasure_code/erasure_code_base_perf \
|
||||
erasure_code/erasure_code_sse_perf \
|
||||
erasure_code/gf_vect_dot_prod_1tbl \
|
||||
erasure_code/gf_vect_dot_prod_avx_perf\
|
||||
erasure_code/gf_vect_dot_prod_sse_perf\
|
||||
erasure_code/gf_2vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_3vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_4vect_dot_prod_sse_perf
|
||||
|
||||
extern_hdrs += include/erasure_code.h \
|
||||
include/gf_vect_mul.h
|
||||
|
||||
other_src += erasure_code/ec_base.h \
|
||||
include/reg_sizes.asm
|
||||
|
||||
check_tests += erasure_code/gf_vect_mul_test \
|
||||
erasure_code/erasure_code_test \
|
||||
erasure_code/gf_inverse_test \
|
||||
erasure_code/erasure_code_update_test
|
||||
|
||||
unit_tests += erasure_code/gf_vect_mul_sse_test \
|
||||
erasure_code/gf_vect_mul_avx_test \
|
||||
erasure_code/gf_vect_mul_base_test \
|
||||
erasure_code/gf_vect_dot_prod_sse_test \
|
||||
erasure_code/gf_vect_dot_prod_avx_test \
|
||||
erasure_code/gf_2vect_dot_prod_sse_test \
|
||||
erasure_code/gf_3vect_dot_prod_sse_test \
|
||||
erasure_code/gf_4vect_dot_prod_sse_test \
|
||||
erasure_code/gf_5vect_dot_prod_sse_test \
|
||||
erasure_code/gf_6vect_dot_prod_sse_test \
|
||||
erasure_code/gf_vect_dot_prod_base_test \
|
||||
erasure_code/gf_vect_dot_prod_test \
|
||||
erasure_code/gf_vect_mad_test \
|
||||
erasure_code/erasure_code_base_test \
|
||||
erasure_code/erasure_code_sse_test
|
||||
|
||||
perf_tests += erasure_code/gf_vect_mul_perf \
|
||||
erasure_code/gf_vect_mul_sse_perf \
|
||||
erasure_code/gf_vect_mul_avx_perf \
|
||||
erasure_code/gf_vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_vect_dot_prod_avx_perf \
|
||||
erasure_code/gf_2vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_3vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_4vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_5vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_6vect_dot_prod_sse_perf \
|
||||
erasure_code/gf_vect_dot_prod_perf \
|
||||
erasure_code/gf_vect_dot_prod_1tbl \
|
||||
erasure_code/gf_vect_mad_perf \
|
||||
erasure_code/erasure_code_perf \
|
||||
erasure_code/erasure_code_base_perf \
|
||||
erasure_code/erasure_code_sse_perf \
|
||||
erasure_code/erasure_code_update_perf
|
||||
|
||||
other_src += include/test.h \
|
||||
include/types.h
|
360
erasure_code/ec_base.c
Normal file
360
erasure_code/ec_base.c
Normal file
@ -0,0 +1,360 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <limits.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "ec_base.h" // for GF tables
|
||||
#include "types.h"
|
||||
|
||||
unsigned char gf_mul(unsigned char a, unsigned char b)
|
||||
{
|
||||
#ifndef GF_LARGE_TABLES
|
||||
int i;
|
||||
|
||||
if ((a == 0) || (b == 0))
|
||||
return 0;
|
||||
|
||||
return gff_base[(i = gflog_base[a] + gflog_base[b]) > 254 ? i - 255 : i];
|
||||
#else
|
||||
return gf_mul_table_base[b * 256 + a];
|
||||
#endif
|
||||
}
|
||||
|
||||
unsigned char gf_inv(unsigned char a)
|
||||
{
|
||||
#ifndef GF_LARGE_TABLES
|
||||
if (a == 0)
|
||||
return 0;
|
||||
|
||||
return gff_base[255 - gflog_base[a]];
|
||||
#else
|
||||
return gf_inv_table_base[a];
|
||||
#endif
|
||||
}
|
||||
|
||||
void gf_gen_rs_matrix(unsigned char *a, int m, int k)
|
||||
{
|
||||
int i, j;
|
||||
unsigned char p, gen = 1;
|
||||
|
||||
memset(a, 0, k * m);
|
||||
for (i = 0; i < k; i++)
|
||||
a[k * i + i] = 1;
|
||||
|
||||
for (i = k; i < m; i++) {
|
||||
p = 1;
|
||||
for (j = 0; j < k; j++) {
|
||||
a[k * i + j] = p;
|
||||
p = gf_mul(p, gen);
|
||||
}
|
||||
gen = gf_mul(gen, 2);
|
||||
}
|
||||
}
|
||||
|
||||
void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k)
|
||||
{
|
||||
int i, j;
|
||||
unsigned char *p;
|
||||
|
||||
// Identity matrix in high position
|
||||
memset(a, 0, k * m);
|
||||
for (i = 0; i < k; i++)
|
||||
a[k * i + i] = 1;
|
||||
|
||||
// For the rest choose 1/(i + j) | i != j
|
||||
p = &a[k * k];
|
||||
for (i = k; i < m; i++)
|
||||
for (j = 0; j < k; j++)
|
||||
*p++ = gf_inv(i ^ j);
|
||||
|
||||
}
|
||||
|
||||
// Invert a square n x n matrix over GF(2^8) by Gauss-Jordan elimination
// with partial (row-swap) pivoting.
//   in_mat  - matrix to invert; NOTE: reduced in place, so its contents
//             are destroyed by this call
//   out_mat - receives the inverse on success
//   n       - matrix dimension
// Returns 0 on success, -1 if the matrix is singular.
int gf_invert_matrix(unsigned char *in_mat, unsigned char *out_mat, const int n)
{
	int i, j, k;
	unsigned char temp;

	// Set out_mat[] to the identity matrix
	for (i = 0; i < n * n; i++)	// memset(out_mat, 0, n*n)
		out_mat[i] = 0;

	for (i = 0; i < n; i++)
		out_mat[i * n + i] = 1;

	// Inverse: every row operation applied to in_mat is mirrored on
	// out_mat, so when in_mat becomes I, out_mat is the inverse.
	for (i = 0; i < n; i++) {
		// Check for 0 in pivot element
		if (in_mat[i * n + i] == 0) {
			// Find a row with non-zero in current column and swap
			for (j = i + 1; j < n; j++)
				if (in_mat[j * n + i])
					break;

			if (j == n)	// Couldn't find means it's singular
				return -1;

			for (k = 0; k < n; k++) {	// Swap rows i,j
				temp = in_mat[i * n + k];
				in_mat[i * n + k] = in_mat[j * n + k];
				in_mat[j * n + k] = temp;

				temp = out_mat[i * n + k];
				out_mat[i * n + k] = out_mat[j * n + k];
				out_mat[j * n + k] = temp;
			}
		}

		temp = gf_inv(in_mat[i * n + i]);	// 1/pivot
		for (j = 0; j < n; j++) {	// Scale row i by 1/pivot
			in_mat[i * n + j] = gf_mul(in_mat[i * n + j], temp);
			out_mat[i * n + j] = gf_mul(out_mat[i * n + j], temp);
		}

		// Eliminate column i from every other row; in GF(2^8)
		// subtraction is XOR, so row_j ^= pivot_coeff * row_i.
		for (j = 0; j < n; j++) {
			if (j == i)
				continue;

			temp = in_mat[j * n + i];
			for (k = 0; k < n; k++) {
				out_mat[j * n + k] ^= gf_mul(temp, out_mat[i * n + k]);
				in_mat[j * n + k] ^= gf_mul(temp, in_mat[i * n + k]);
			}
		}
	}
	return 0;
}
|
||||
|
||||
// Calculates const table gftbl in GF(2^8) from single input A
// gftbl(A) = {A{00}, A{01}, A{02}, ... , A{0f} }, {A{00}, A{10}, A{20}, ... , A{f0} }

// Expand a single GF(2^8) coefficient c into the 32-byte split table used
// by the vector kernels: tbl[0..15] holds c * {0x0..0xf} (low-nibble
// products) and tbl[16..31] holds c * {0x00,0x10,...,0xf0} (high-nibble
// products).  Note tbl[1] == c itself; the *_base functions read the raw
// coefficient back from that slot.
void gf_vect_mul_init(unsigned char c, unsigned char *tbl)
{
	// Successive GF doublings of c; 0x1d is the low byte of the field
	// polynomial used for reduction when the top bit shifts out.
	unsigned char c2 = (c << 1) ^ ((c & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	unsigned char c4 = (c2 << 1) ^ ((c2 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	unsigned char c8 = (c4 << 1) ^ ((c4 & 0x80) ? 0x1d : 0);	//Mult by GF{2}

#if __WORDSIZE == 64 || _WIN64 || __x86_64__
	unsigned long long v1, v2, v4, v8, *t;
	unsigned long long v10, v20, v40, v80;
	unsigned char c17, c18, c20, c24;

	// NOTE(review): stores to tbl 8 bytes at a time through a 64-bit
	// pointer; assumes the target tolerates the access alignment
	// (true on x86) — confirm if ported elsewhere.
	t = (unsigned long long *)tbl;

	// Each multiplier constant replicates cN into exactly the byte
	// lanes whose table index has the corresponding bit set, so
	// XOR-ing the products yields tbl[n] = c * n for n = 0..7.
	v1 = c * 0x0100010001000100ull;		// lanes 1,3,5,7 (bit 0)
	v2 = c2 * 0x0101000001010000ull;	// lanes 2,3,6,7 (bit 1)
	v4 = c4 * 0x0101010100000000ull;	// lanes 4,5,6,7 (bit 2)
	v8 = c8 * 0x0101010101010101ull;	// all lanes     (bit 3)

	v4 = v1 ^ v2 ^ v4;
	t[0] = v4;		// tbl[0..7]  = c * {0..7}
	t[1] = v8 ^ v4;		// tbl[8..15] = c * {8..15}

	// Continue doubling for the high-nibble half: c17 = c * 0x10, etc.
	// (names refer to the table slot each value lands in)
	c17 = (c8 << 1) ^ ((c8 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c18 = (c17 << 1) ^ ((c17 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c20 = (c18 << 1) ^ ((c18 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c24 = (c20 << 1) ^ ((c20 & 0x80) ? 0x1d : 0);	//Mult by GF{2}

	// Same lane-replication trick for tbl[16..31]
	v10 = c17 * 0x0100010001000100ull;
	v20 = c18 * 0x0101000001010000ull;
	v40 = c20 * 0x0101010100000000ull;
	v80 = c24 * 0x0101010101010101ull;

	v40 = v10 ^ v20 ^ v40;
	t[2] = v40;		// tbl[16..23] = c * {0x00..0x70}
	t[3] = v80 ^ v40;	// tbl[24..31] = c * {0x80..0xf0}

#else // 32-bit or other
	// Portable path: build every entry explicitly by XOR-combining the
	// doubled values (cN = c * N in GF(2^8), N in hex).
	unsigned char c3, c5, c6, c7, c9, c10, c11, c12, c13, c14, c15;
	unsigned char c17, c18, c19, c20, c21, c22, c23, c24, c25, c26, c27, c28, c29, c30,
	    c31;

	c3 = c2 ^ c;
	c5 = c4 ^ c;
	c6 = c4 ^ c2;
	c7 = c4 ^ c3;

	c9 = c8 ^ c;
	c10 = c8 ^ c2;
	c11 = c8 ^ c3;
	c12 = c8 ^ c4;
	c13 = c8 ^ c5;
	c14 = c8 ^ c6;
	c15 = c8 ^ c7;

	// Low-nibble half: tbl[n] = c * n
	tbl[0] = 0;
	tbl[1] = c;
	tbl[2] = c2;
	tbl[3] = c3;
	tbl[4] = c4;
	tbl[5] = c5;
	tbl[6] = c6;
	tbl[7] = c7;
	tbl[8] = c8;
	tbl[9] = c9;
	tbl[10] = c10;
	tbl[11] = c11;
	tbl[12] = c12;
	tbl[13] = c13;
	tbl[14] = c14;
	tbl[15] = c15;

	// High-nibble half: c17 = c * 0x10, then XOR-combine as above
	// (names refer to the table slot each value lands in)
	c17 = (c8 << 1) ^ ((c8 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c18 = (c17 << 1) ^ ((c17 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c19 = c18 ^ c17;
	c20 = (c18 << 1) ^ ((c18 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c21 = c20 ^ c17;
	c22 = c20 ^ c18;
	c23 = c20 ^ c19;
	c24 = (c20 << 1) ^ ((c20 & 0x80) ? 0x1d : 0);	//Mult by GF{2}
	c25 = c24 ^ c17;
	c26 = c24 ^ c18;
	c27 = c24 ^ c19;
	c28 = c24 ^ c20;
	c29 = c24 ^ c21;
	c30 = c24 ^ c22;
	c31 = c24 ^ c23;

	tbl[16] = 0;
	tbl[17] = c17;
	tbl[18] = c18;
	tbl[19] = c19;
	tbl[20] = c20;
	tbl[21] = c21;
	tbl[22] = c22;
	tbl[23] = c23;
	tbl[24] = c24;
	tbl[25] = c25;
	tbl[26] = c26;
	tbl[27] = c27;
	tbl[28] = c28;
	tbl[29] = c29;
	tbl[30] = c30;
	tbl[31] = c31;

#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
}
|
||||
|
||||
/*
 * Reference GF(2^8) dot product:
 *   dest[i] = XOR over j of src[j][i] * coef[j]
 * The j-th coefficient is read from slot 1 of the j-th 32-byte expanded
 * table in v (slot 1 of each table holds the raw coefficient).
 */
void gf_vect_dot_prod_base(int len, int vlen, unsigned char *v,
			   unsigned char **src, unsigned char *dest)
{
	int pos, vec;
	unsigned char acc;

	for (pos = 0; pos < len; pos++) {
		acc = 0;
		for (vec = 0; vec < vlen; vec++)
			acc ^= gf_mul(src[vec][pos], v[32 * vec + 1]);

		dest[pos] = acc;
	}
}
|
||||
|
||||
/*
 * Reference GF(2^8) multiply-accumulate:
 *   dest[i] ^= src[i] * c
 * where c sits in slot 1 of table vec_i within v.  The 'vec' argument
 * (total vector count) is unused here but kept for a uniform signature.
 */
void gf_vect_mad_base(int len, int vec, int vec_i,
		      unsigned char *v, unsigned char *src, unsigned char *dest)
{
	int pos;

	for (pos = 0; pos < len; pos++)
		dest[pos] ^= gf_mul(src[pos], v[vec_i * 32 + 1]);
}
|
||||
|
||||
/*
 * Reference erasure-code encode: for each of 'dests' outputs compute a
 * GF(2^8) dot product across the 'srcs' inputs.  v holds dests * srcs
 * expanded 32-byte tables; the coefficient for (output out, input in)
 * is at v[in*32 + out*srcs*32 + 1].
 */
void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v,
			 unsigned char **src, unsigned char **dest)
{
	int out, pos, in;
	unsigned char acc;

	for (out = 0; out < dests; out++) {
		for (pos = 0; pos < len; pos++) {
			acc = 0;
			for (in = 0; in < srcs; in++)
				acc ^= gf_mul(src[in][pos], v[in * 32 + out * srcs * 32 + 1]);

			dest[out][pos] = acc;
		}
	}
}
|
||||
|
||||
/*
 * Reference incremental encode: fold a single source buffer 'data'
 * (source index vec_i of k) into every one of 'rows' partial parity
 * buffers.  The coefficient for (row l, source vec_i) sits at
 * v[vec_i*32 + l*k*32 + 1].
 */
void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
				unsigned char *data, unsigned char **dest)
{
	int row, pos;

	for (row = 0; row < rows; row++)
		for (pos = 0; pos < len; pos++)
			dest[row][pos] ^= gf_mul(data[pos], v[vec_i * 32 + row * k * 32 + 1]);
}
|
||||
|
||||
/*
 * Reference constant multiply: dest[i] = c * src[i] in GF(2^8).
 * a is a 32-byte expanded table; a[1] (the 2nd element) holds the raw
 * coefficient the table was filled from.
 */
void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest)
{
	//2nd element of table array is ref value used to fill it in
	unsigned char c = a[1];
	int i;

	for (i = 0; i < len; i++)
		dest[i] = gf_mul(c, src[i]);
}
|
||||
|
||||
// Per-function version record embedded in the compiled object.
// UINT16/UINT8 come from "types.h".
struct slver {
	UINT16 snum;	// function serial number
	UINT8 ver;	// version
	UINT8 core;	// implementation/core id
};

// Version info
// For each function: a declaration whose name encodes the version
// (scannable symbol), plus an initialized record with the same fields.
struct slver gf_vect_mul_init_slver_00020035;
struct slver gf_vect_mul_init_slver = { 0x0035, 0x02, 0x00 };

struct slver ec_encode_data_base_slver_00010135;
struct slver ec_encode_data_base_slver = { 0x0135, 0x01, 0x00 };

struct slver gf_vect_mul_base_slver_00010136;
struct slver gf_vect_mul_base_slver = { 0x0136, 0x01, 0x00 };

struct slver gf_vect_dot_prod_base_slver_00010137;
struct slver gf_vect_dot_prod_base_slver = { 0x0137, 0x01, 0x00 };

struct slver gf_mul_slver_00000214;
struct slver gf_mul_slver = { 0x0214, 0x00, 0x00 };

struct slver gf_invert_matrix_slver_00000215;
struct slver gf_invert_matrix_slver = { 0x0215, 0x00, 0x00};

struct slver gf_gen_rs_matrix_slver_00000216;
struct slver gf_gen_rs_matrix_slver = { 0x0216, 0x00, 0x00 };

struct slver gf_gen_cauchy1_matrix_slver_00000217;
struct slver gf_gen_cauchy1_matrix_slver = { 0x0217, 0x00, 0x00};
|
6680
erasure_code/ec_base.h
Normal file
6680
erasure_code/ec_base.h
Normal file
File diff suppressed because it is too large
Load Diff
267
erasure_code/ec_highlevel_func.c
Normal file
267
erasure_code/ec_highlevel_func.c
Normal file
@ -0,0 +1,267 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
#include <limits.h>
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
/*
 * Expand an encode matrix of rows * k GF(2^8) coefficients in a[] into
 * the concatenated 32-byte-per-coefficient tables in g_tbls, in the
 * layout the encode kernels expect (row-major, 32 bytes per entry).
 */
void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
{
	int idx;
	const int total = rows * k;

	for (idx = 0; idx < total; idx++)
		gf_vect_mul_init(a[idx], g_tbls + 32 * idx);
}
|
||||
|
||||
/*
 * Encode parity with the SSE4.1 dot-product kernels.
 * Processes four parity rows per pass while possible, then the
 * remaining 0-3 rows.  Buffers shorter than 16 bytes fall back to the
 * portable base implementation.
 */
void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
			unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
		return;
	}

	/* Four outputs at a time; advance past 4 rows of tables each pass */
	for (; rows >= 4; rows -= 4) {
		gf_4vect_dot_prod_sse(len, k, g_tbls, data, coding);
		g_tbls += 4 * k * 32;
		coding += 4;
	}

	/* Tail: 0..3 rows remain */
	if (rows == 3)
		gf_3vect_dot_prod_sse(len, k, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_dot_prod_sse(len, k, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_dot_prod_sse(len, k, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Encode parity with the AVX dot-product kernels.
 * Same structure as the SSE path: four rows per pass, then the 0-3 row
 * tail; buffers under 16 bytes use the portable base implementation.
 */
void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
			unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
		return;
	}

	/* Four outputs at a time; advance past 4 rows of tables each pass */
	for (; rows >= 4; rows -= 4) {
		gf_4vect_dot_prod_avx(len, k, g_tbls, data, coding);
		g_tbls += 4 * k * 32;
		coding += 4;
	}

	/* Tail: 0..3 rows remain */
	if (rows == 3)
		gf_3vect_dot_prod_avx(len, k, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_dot_prod_avx(len, k, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_dot_prod_avx(len, k, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Encode parity with the AVX2 dot-product kernels.
 * Same structure as the SSE/AVX paths, but the minimum vector length
 * is 32 bytes (256-bit registers); shorter buffers use the base path.
 */
void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
			 unsigned char **coding)
{
	if (len < 32) {
		ec_encode_data_base(len, k, rows, g_tbls, data, coding);
		return;
	}

	/* Four outputs at a time; advance past 4 rows of tables each pass */
	for (; rows >= 4; rows -= 4) {
		gf_4vect_dot_prod_avx2(len, k, g_tbls, data, coding);
		g_tbls += 4 * k * 32;
		coding += 4;
	}

	/* Tail: 0..3 rows remain */
	if (rows == 3)
		gf_3vect_dot_prod_avx2(len, k, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_dot_prod_avx2(len, k, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_dot_prod_avx2(len, k, g_tbls, data, *coding);
}
|
||||
|
||||
#if __WORDSIZE == 64 || _WIN64 || __x86_64__
|
||||
|
||||
/*
 * Incrementally fold one source buffer into the parity rows using the
 * SSE4.1 multiply-accumulate kernels: six rows per pass, then the
 * remaining 1-6 rows.  Buffers under 16 bytes use the base path.
 */
void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
			       unsigned char *data, unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
		return;
	}

	/* Six outputs at a time while more than six remain */
	for (; rows > 6; rows -= 6) {
		gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
		g_tbls += 6 * k * 32;
		coding += 6;
	}

	/* Tail: 0..6 rows remain */
	if (rows == 6)
		gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 5)
		gf_5vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 4)
		gf_4vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 3)
		gf_3vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_mad_sse(len, k, vec_i, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Incrementally fold one source buffer into the parity rows using the
 * AVX multiply-accumulate kernels.  Same structure as the SSE path;
 * buffers under 16 bytes use the base implementation.
 */
void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
			       unsigned char *data, unsigned char **coding)
{
	if (len < 16) {
		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
		return;
	}

	/* Six outputs at a time while more than six remain */
	for (; rows > 6; rows -= 6) {
		gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
		g_tbls += 6 * k * 32;
		coding += 6;
	}

	/* Tail: 0..6 rows remain */
	if (rows == 6)
		gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 5)
		gf_5vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 4)
		gf_4vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 3)
		gf_3vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_mad_avx(len, k, vec_i, g_tbls, data, *coding);
}
|
||||
|
||||
/*
 * Incrementally fold one source buffer into the parity rows using the
 * AVX2 multiply-accumulate kernels.  Minimum vector length is 32 bytes
 * (256-bit registers); shorter buffers use the base implementation.
 */
void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
				unsigned char *data, unsigned char **coding)
{
	if (len < 32) {
		ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
		return;
	}

	/* Six outputs at a time while more than six remain */
	for (; rows > 6; rows -= 6) {
		gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
		g_tbls += 6 * k * 32;
		coding += 6;
	}

	/* Tail: 0..6 rows remain */
	if (rows == 6)
		gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 5)
		gf_5vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 4)
		gf_4vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 3)
		gf_3vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 2)
		gf_2vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
	else if (rows == 1)
		gf_vect_mad_avx2(len, k, vec_i, g_tbls, data, *coding);
}
|
||||
|
||||
#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
|
||||
|
||||
// Per-function version record embedded in the compiled object
// (same layout as the record in ec_base.c; UINT types from "types.h").
struct slver {
	UINT16 snum;	// function serial number
	UINT8 ver;	// version
	UINT8 core;	// implementation/core id
};

// Version info
struct slver ec_init_tables_slver_00010068;
struct slver ec_init_tables_slver = { 0x0068, 0x01, 0x00 };

struct slver ec_encode_data_sse_slver_00020069;
struct slver ec_encode_data_sse_slver = { 0x0069, 0x02, 0x00 };
|
395
erasure_code/ec_multibinary.asm
Normal file
395
erasure_code/ec_multibinary.asm
Normal file
@ -0,0 +1,395 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; For ELF64 shared objects, route external references through the PLT;
;; other output formats need no relocation suffix.
%ifidn __OUTPUT_FORMAT__, elf64
%define WRT_OPT		wrt ..plt
%else
%define WRT_OPT
%endif
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
;; Select word size and scratch registers for the dispatchers below.
;; def_wrd/wrd_sz abstract the pointer width; arg1..arg5 are the caller
;; registers the dispatch code must preserve and use for scratch.
%ifidn __OUTPUT_FORMAT__, elf32

[bits 32]

%define def_wrd		dd
%define wrd_sz		dword
%define arg1		esi
%define arg2		eax
%define arg3		ebx
%define arg4		ecx
%define arg5		edx

%else

default rel
[bits 64]

%define def_wrd		dq
%define wrd_sz		qword
%define arg1		rsi
%define arg2		rax
%define arg3		rbx
%define arg4		rcx
%define arg5		rdx

;; SIMD implementations only referenced from the 64-bit build
extern ec_encode_data_update_sse
extern ec_encode_data_update_avx
extern ec_encode_data_update_avx2
extern gf_vect_mul_sse
extern gf_vect_mul_avx

extern gf_vect_mad_sse
extern gf_vect_mad_avx
extern gf_vect_mad_avx2
%endif
|
||||
|
||||
extern gf_vect_mul_base
|
||||
extern ec_encode_data_base
|
||||
extern ec_encode_data_update_base
|
||||
extern gf_vect_dot_prod_base
|
||||
extern gf_vect_mad_base
|
||||
|
||||
extern gf_vect_dot_prod_sse
|
||||
extern gf_vect_dot_prod_avx
|
||||
extern gf_vect_dot_prod_avx2
|
||||
extern ec_encode_data_sse
|
||||
extern ec_encode_data_avx
|
||||
extern ec_encode_data_avx2
|
||||
|
||||
|
||||
section .data
;;; *_mbinit are initial values for *_dispatched; is updated on first call.
;;; Therefore, *_dispatch_init is only executed on first call.

;; Each pointer initially targets the *_mbinit stub, which runs the CPU
;; feature probe, overwrites the pointer with the chosen implementation,
;; and falls through; every later call jumps straight to the selection.
ec_encode_data_dispatched:
	def_wrd		ec_encode_data_mbinit

gf_vect_mul_dispatched:
	def_wrd		gf_vect_mul_mbinit

gf_vect_dot_prod_dispatched:
	def_wrd		gf_vect_dot_prod_mbinit

ec_encode_data_update_dispatched:
	def_wrd		ec_encode_data_update_mbinit

gf_vect_mad_dispatched:
	def_wrd		gf_vect_mad_mbinit
|
||||
|
||||
section .text
;;;;
; ec_encode_data multibinary function
;;;;
global ec_encode_data:function
ec_encode_data_mbinit:
	;; First call only: pick an implementation, then fall through to
	;; the jump below, which now targets the chosen routine.
	call	ec_encode_data_dispatch_init

ec_encode_data:
	jmp	wrd_sz [ec_encode_data_dispatched]

ec_encode_data_dispatch_init:
	;; Save caller-visible registers (cpuid clobbers eax/ebx/ecx/edx)
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5
	lea	arg1, [ec_encode_data_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	arg3, [ec_encode_data_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, arg3

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	arg3, [ec_encode_data_avx WRT_OPT]

	jne	_done_ec_encode_data_init
	mov	arg1, arg3

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	arg3, [ec_encode_data_avx2 WRT_OPT]
	cmovne	arg1, arg3
	;; Does it have xmm and ymm support (OS enabled XMM/YMM state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_ec_encode_data_init
	;; OS does not save YMM state: fall back to SSE
	lea	arg1, [ec_encode_data_sse WRT_OPT]

_done_ec_encode_data_init:
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	mov	[ec_encode_data_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; gf_vect_mul multibinary function
;;;;
global gf_vect_mul:function
gf_vect_mul_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	gf_vect_mul_dispatch_init

gf_vect_mul:
	jmp	wrd_sz [gf_vect_mul_dispatched]

gf_vect_mul_dispatch_init:
	push	arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	;; 32-bit build exposes only the portable C version
	lea	arg1, [gf_vect_mul_base]
%else
	push	rax
	push	rbx
	push	rcx
	push	rdx
	lea	arg1, [gf_vect_mul_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; NOTE(review): gates the SSE path on the SSE4.2 cpuid bit (the
	;; other dispatchers test SSE4.1) — confirm this is intentional.
	test	ecx, FLAG_CPUID1_ECX_SSE4_2
	lea	rbx, [gf_vect_mul_sse WRT_OPT]
	je	_done_gf_vect_mul_dispatch_init
	mov	arg1, rbx

	;; Try for AVX
	and	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	cmp	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	jne	_done_gf_vect_mul_dispatch_init

	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	jne	_done_gf_vect_mul_dispatch_init
	lea	arg1, [gf_vect_mul_avx WRT_OPT]

_done_gf_vect_mul_dispatch_init:
	pop	rdx
	pop	rcx
	pop	rbx
	pop	rax
%endif		;; END 32-bit check
	mov	[gf_vect_mul_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; ec_encode_data_update multibinary function
;;;;
global ec_encode_data_update:function
ec_encode_data_update_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	ec_encode_data_update_dispatch_init

ec_encode_data_update:
	jmp	wrd_sz [ec_encode_data_update_dispatched]

ec_encode_data_update_dispatch_init:
	push	arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	;; 32-bit build exposes only the portable C version
	lea	arg1, [ec_encode_data_update_base]
%else
	push	rax
	push	rbx
	push	rcx
	push	rdx
	lea	arg1, [ec_encode_data_update_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	rbx, [ec_encode_data_update_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, rbx

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	rbx, [ec_encode_data_update_avx WRT_OPT]

	jne	_done_ec_encode_data_update_init
	;; rsi is arg1 under the 64-bit register mapping above
	mov	rsi, rbx

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	rbx, [ec_encode_data_update_avx2 WRT_OPT]
	cmovne	rsi, rbx

	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_ec_encode_data_update_init
	;; OS does not save YMM state: fall back to SSE
	lea	rsi, [ec_encode_data_update_sse WRT_OPT]

_done_ec_encode_data_update_init:
	pop	rdx
	pop	rcx
	pop	rbx
	pop	rax
%endif		;; END 32-bit check
	mov	[ec_encode_data_update_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; gf_vect_dot_prod multibinary function
;;;;
global gf_vect_dot_prod:function
gf_vect_dot_prod_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	gf_vect_dot_prod_dispatch_init

gf_vect_dot_prod:
	jmp	wrd_sz [gf_vect_dot_prod_dispatched]

gf_vect_dot_prod_dispatch_init:
	;; Save caller-visible registers (cpuid clobbers eax/ebx/ecx/edx)
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5
	lea	arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	arg3, [gf_vect_dot_prod_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, arg3

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	arg3, [gf_vect_dot_prod_avx WRT_OPT]

	jne	_done_gf_vect_dot_prod_init
	mov	arg1, arg3

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	arg3, [gf_vect_dot_prod_avx2 WRT_OPT]
	cmovne	arg1, arg3
	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_gf_vect_dot_prod_init
	;; OS does not save YMM state: fall back to SSE
	lea	arg1, [gf_vect_dot_prod_sse WRT_OPT]

_done_gf_vect_dot_prod_init:
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	mov	[gf_vect_dot_prod_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;;;;
; gf_vect_mad multibinary function
;;;;
global gf_vect_mad:function
gf_vect_mad_mbinit:
	;; First call only: resolve, then fall through to the jump below
	call	gf_vect_mad_dispatch_init

gf_vect_mad:
	jmp	wrd_sz [gf_vect_mad_dispatched]

gf_vect_mad_dispatch_init:
	push	arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	;; 32-bit build exposes only the portable C version
	lea	arg1, [gf_vect_mad_base]
%else
	push	rax
	push	rbx
	push	rcx
	push	rdx
	lea	arg1, [gf_vect_mad_base WRT_OPT] ; Default

	mov	eax, 1
	cpuid
	;; SSE4.1 present -> select the SSE implementation
	lea	rbx, [gf_vect_mad_sse WRT_OPT]
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne	arg1, rbx

	;; AVX needs both the AVX and OSXSAVE cpuid bits
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	rbx, [gf_vect_mad_avx WRT_OPT]

	jne	_done_gf_vect_mad_init
	;; rsi is arg1 under the 64-bit register mapping above
	mov	rsi, rbx

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea	rbx, [gf_vect_mad_avx2 WRT_OPT]
	cmovne	rsi, rbx

	;; Does it have xmm and ymm support (OS enabled state in XCR0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_gf_vect_mad_init
	;; OS does not save YMM state: fall back to SSE
	lea	rsi, [gf_vect_mad_sse WRT_OPT]

_done_gf_vect_mad_init:
	pop	rdx
	pop	rcx
	pop	rbx
	pop	rax
%endif		;; END 32-bit check
	mov	[gf_vect_mad_dispatched], arg1	; publish chosen entry
	pop	arg1
	ret
|
||||
|
||||
;; Emit per-function version markers into the object file
;; (slversion macro presumably comes from reg_sizes.asm — confirm).
;;; func			core, ver, snum
slversion ec_encode_data,	00,   04,  0133
slversion gf_vect_mul,		00,   03,  0134
slversion ec_encode_data_update, 00,  03,  0212
slversion gf_vect_dot_prod,	00,   03,  0138
slversion gf_vect_mad,		00,   02,  0213
|
168
erasure_code/erasure_code_base_perf.c
Normal file
168
erasure_code/erasure_code_base_perf.c
Normal file
@ -0,0 +1,168 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
// Benchmark configuration: choose between a cache-hot run (many loops
// over a small buffer), a cache-cold run (few loops over a buffer larger
// than L3), or fully custom parameters via TEST_CUSTOM.
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 32
# define TEST_LEN(m)  ((128*1024 / m) & ~(64-1))	// per-buffer size, 64B aligned
# define TEST_LOOPS(m)  (100*m)
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test. Pull from large mem base.
#  define TEST_SOURCES 32
#  define GT_L3_CACHE  32*1024*1024	/* some number > last level cache */
#  define TEST_LEN(m)  ((GT_L3_CACHE / m) & ~(64-1))	// per-buffer size, 64B aligned
#  define TEST_LOOPS(m)  (10)
#  define TEST_TYPE_STR "_cold"
# else
#  define TEST_TYPE_STR "_cus"
#  ifndef TEST_LOOPS
#   define TEST_LOOPS(m)  1000
#  endif
# endif
#endif

// Matrix dimension caps: total buffers (m) and data buffers (k)
#define MMAX TEST_SOURCES
#define KMAX TEST_SOURCES

typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
{
	// Performance harness for the base (non-SIMD) erasure-code path:
	// times repeated encode and decode passes over an m x TEST_LEN(m)
	// buffer set and prints throughput via perf_print().
	int i, j, rtest, m, k, nerrs, r;
	void *buf;
	u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	// a = encode matrix, b = surviving-rows submatrix, d = inverse of b,
	// c = decode rows extracted from d for the erased sources.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
	u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
	struct perf start, stop;

	// Pick test parameters: m total buffers, k data buffers, nerrs fixed
	// erasures chosen from err_list (all < k, i.e. data erasures).
	m = 14;
	k = 10;
	nerrs = 4;
	const u8 err_list[] = { 2, 4, 5, 7 };

	printf("erasure_code_base_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);

	if (m > MMAX || k > KMAX || nerrs > (m - k)) {
		printf(" Input test parameter error\n");
		return -1;
	}

	// Mark the erased sources both as a list and as a per-index flag map.
	memcpy(src_err_list, err_list, nerrs);
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0; i < nerrs; i++)
		src_in_err[src_err_list[i]] = 1;

	// Allocate the arrays (64-byte aligned; never freed — process exits
	// right after the test, so cleanup is deliberately skipped).
	for (i = 0; i < m; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		buffs[i] = buf;
	}

	// m - k scratch buffers receive the recovered data during decode.
	for (i = 0; i < (m - k); i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN(m); j++)
			buffs[i][j] = rand();

	// One warm-up encode before timing starts.
	gf_gen_rs_matrix(a, m, k);
	ec_init_tables(k, m - k, &a[k * k], g_tbls);
	ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);

	// Start encode test: table init + encode are both inside the timed
	// loop, matching what a caller would pay per encode.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		ec_encode_data_base(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	}
	perf_stop(&stop);
	printf("erasure_code_base_encode" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);

	// Start decode test: each iteration rebuilds the decode matrix from
	// scratch (submatrix selection, inversion, table init) plus recovery.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r])
				r++;
			recov[i] = buffs[r];
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix(b, d, k) < 0) {
			printf("BAD MATRIX\n");
			return -1;
		}

		// Rows of the inverse corresponding to erased (data) sources
		// form the decode matrix. Valid because all erasures are < k.
		for (i = 0; i < nerrs; i++)
			for (j = 0; j < k; j++)
				c[k * i + j] = d[k * src_err_list[i] + j];

		// Recover data
		ec_init_tables(k, nerrs, c, g_tbls);
		ec_encode_data_base(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
	}
	perf_stop(&stop);

	// Verify the last decode actually reproduced the erased buffers
	// before trusting/printing the timing numbers.
	for (i = 0; i < nerrs; i++) {
		if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
			printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
			return -1;
		}
	}

	printf("erasure_code_base_decode" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);

	printf("done all: Pass\n");
	return 0;
}
|
764
erasure_code/erasure_code_base_test.c
Normal file
764
erasure_code/erasure_code_base_test.c
Normal file
@ -0,0 +1,764 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 50
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#define EFENCE_TEST_MIN_SIZE 16
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf as two-digit hex values, 32 per output row,
// finishing with a newline.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored as k row pointers, one row per line, each
// element as a two-digit hex value.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored as one flat row-major byte array, one row
// per line, each element as a two-digit hex value.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[col + (row * m)] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
|
||||
// Generate decode matrix from encode matrix
|
||||
// Generate decode matrix from encode matrix.
// Builds the k x k submatrix b of encode_matrix from surviving rows,
// inverts it, and derives decode_matrix rows for every erased source in
// src_err_list (nsrcerrs data erasures first, then parity erasures).
// decode_index[] records which surviving row feeds each of the k inputs.
// Returns 0 on success, -1 on allocation failure, NO_INVERT_MATRIX if no
// invertible submatrix can be found.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	// b is the candidate submatrix handed to gf_invert_matrix (which may
	// clobber it); backup keeps a pristine copy for retry attempts.
	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	// Retry loop: if b is singular (possible with gf_gen_rs_matrix),
	// replace the last chosen row with the next surviving row further
	// down the encode matrix and try the inversion again.
	incr = 0;
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		if (nerrs == (m - k)) {
			// All redundancy consumed — no spare row to swap in.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// Skip candidate rows that are themselves erased parity rows.
		// NOTE(review): the bound `i < nerrs - nsrcerrs` looks odd —
		// parity erasures occupy src_err_list[nsrcerrs..nerrs-1], so
		// `i < nerrs` would seem intended; and the loop only bumps
		// incr once per matching entry without re-checking earlier
		// entries. Left as-is; confirm against upstream isa-l.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			// Ran past the last row of the encode matrix.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased data sources: decode rows come straight from the inverse.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			// GF(2^8) dot product: (erased parity row of encode
			// matrix) x (column i of the inverse).
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
{
	// Functional test of the base (non-SIMD) erasure-code path. Five
	// phases: fixed RS-matrix test, fixed Cauchy-matrix test, random
	// geometry tests, end-of-buffer (Electric Fence) tests, unaligned
	// pointer tests, and size-alignment sweep. Seeded PRNG makes the
	// whole run reproducible; the rand() call order must not change.
	int re = 0;
	int i, j, p, rtest, m, k;
	int nerrs, nsrcerrs;
	void *buf;
	unsigned int decode_index[MMAX];
	unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
	unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
	unsigned char *recov[TEST_SOURCES];

	int rows, align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *temp_ubuffs[TEST_SOURCES];

	printf("erasure_code_base_test: %dx%d ", TEST_SOURCES, TEST_LEN);
	srand(TEST_SEED);

	// Allocate the arrays (64-byte aligned; freed implicitly at exit)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Test erasure code by encode and recovery

	encode_matrix = malloc(MMAX * KMAX);
	decode_matrix = malloc(MMAX * KMAX);
	invert_matrix = malloc(MMAX * KMAX);
	g_tbls = malloc(KMAX * TEST_SOURCES * 32);
	if (encode_matrix == NULL || decode_matrix == NULL
	    || invert_matrix == NULL || g_tbls == NULL) {
		printf("Test failure! Error with malloc\n");
		return -1;
	}
	// ---- Phase 1: fixed-size test with a Reed-Solomon matrix ----
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Generate encode matrix encode_matrix
	// The matrix generated by gf_gen_rs_matrix
	// is not always invertable.
	gf_gen_rs_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data, then compare each recovered buffer against its
	// original; dump full diagnostics on the first mismatch.
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// ---- Phase 2: same fixed-size test with a Cauchy matrix ----
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// The matrix generated by gf_gen_cauchy1_matrix
	// is always invertable.
	gf_gen_cauchy1_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// ---- Phase 3: random (m, k) geometries with random erasures ----
	// Do more random tests
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		// Draw m in [2, MMAX) and k in [1, m).
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data_base(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data_base(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((u8 *) encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((u8 *) decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
		putchar('.');	// progress marker per passing iteration
	}

	// ---- Phase 4: end-of-buffer tests (catch overruns under efence) ----
	// Run tests at end of buffer for Electric Fence
	k = 16;
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	if (k > KMAX)
		return -1;

	for (rows = 1; rows <= 16; rows++) {
		m = k + rows;
		if (m > MMAX)
			return -1;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
			for (i = 0; i < m; i++) {	// Line up TEST_SIZE from end
				efence_buffs[i] = buffs[i] + TEST_LEN - size;
			}

			// The matrix generated by gf_gen_cauchy1_matrix
			// is always invertable.
			gf_gen_cauchy1_matrix(encode_matrix, m, k);

			// Make parity vects
			// Generate g_tbls from encode matrix a
			ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
			// Perform matrix dot_prod for EC encoding
			// using g_tbls from encode matrix a
			ec_encode_data_base(size, k, m - k, g_tbls, efence_buffs,
					    &efence_buffs[k]);

			// Random errors
			memset(src_in_err, 0, TEST_SOURCES);
			gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

			// Generate decode matrix
			re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
						  invert_matrix, decode_index, src_err_list,
						  src_in_err, nerrs, nsrcerrs, k, m);
			if (re != 0) {
				printf("Fail to gf_gen_decode_matrix\n");
				return -1;
			}
			// Pack recovery array as list of valid sources
			// Its order must be the same as the order
			// to generate matrix b in gf_gen_decode_matrix
			for (i = 0; i < k; i++) {
				recov[i] = efence_buffs[decode_index[i]];
			}

			// Recover data
			ec_init_tables(k, nerrs, decode_matrix, g_tbls);
			ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

			for (i = 0; i < nerrs; i++) {

				if (0 !=
				    memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
					   size)) {
					printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
					       k, nerrs);

					printf("size = %d\n", size);

					printf("Test erase list = ");
					for (j = 0; j < nerrs; j++)
						printf(" %d", src_err_list[j]);
					printf(" - Index = ");
					for (p = 0; p < k; p++)
						printf(" %d", decode_index[p]);
					printf("\nencode_matrix:\n");
					dump_u8xu8((u8 *) encode_matrix, m, k);
					printf("inv b:\n");
					dump_u8xu8((u8 *) invert_matrix, k, k);
					printf("\ndecode_matrix:\n");
					dump_u8xu8((u8 *) decode_matrix, m, k);

					printf("recov %d:", src_err_list[i]);
					dump(temp_buffs[k + i], align);
					printf("orig :");
					dump(efence_buffs[src_err_list[i]], align);
					return -1;
				}
			}
		}

	}

	// ---- Phase 5: random pointer-alignment tests with pad checking ----
	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < m; i++) {
			memset(buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			memset(temp_buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
			temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
		}

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data_base(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = ubuffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(ubuffs, m, 25);
				printf("orig :");
				dump(ubuffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_ubuffs[k + i], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		for (i = 0; i < m; i++) {

			offset = ubuffs[i] - buffs[i];

			if (memcmp(buffs[i], temp_buffs[0], offset)) {
				printf("Fail rand ualign encode pad start\n");
				return -1;
			}
			if (memcmp
			    (buffs[i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign encode pad end\n");
				return -1;
			}
		}

		for (i = 0; i < nerrs; i++) {

			offset = temp_ubuffs[k + i] - temp_buffs[k + i];
			if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
				printf("Fail rand ualign decode pad start\n");
				return -1;
			}
			if (memcmp
			    (temp_buffs[k + i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign decode pad end\n");
				return -1;
			}
		}

		putchar('.');
	}

	// ---- Phase 6: sweep over lengths to exercise size alignment ----
	// Test size alignment

	align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;

	for (size = TEST_LEN; size > 0; size -= align) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data_base(size, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data_base(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
	}

	printf("done EC tests: Pass\n");
	return 0;
}
|
168
erasure_code/erasure_code_perf.c
Normal file
168
erasure_code/erasure_code_perf.c
Normal file
@ -0,0 +1,168 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 32
|
||||
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (10000*m)
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 32
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (50*m)
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS(m) 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Benchmark driver: measures Reed-Solomon encode and decode throughput for
// one fixed (m, k, nerrs) configuration using the generic dispatch entry
// points ec_init_tables()/ec_encode_data().  perf_start/perf_stop/perf_print
// come from test.h (timing helpers — exact units printed by perf_print).
int main(int argc, char *argv[])
{
	int i, j, rtest, m, k, nerrs, r;
	void *buf;
	u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	// a: encode matrix (m x k); b: a with erased rows removed (k x k);
	// d: inverse of b; c: rows of d selected for the erased sources.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
	u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
	struct perf start, stop;

	// Pick test parameters
	// NOTE: all entries of err_list are < k, i.e. only data (source)
	// fragments are erased.  The decode path below relies on this: it
	// indexes rows of the inverse d directly by src_err_list[].
	m = 14;
	k = 10;
	nerrs = 4;
	const u8 err_list[] = { 2, 4, 5, 7 };

	printf("erasure_code_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);

	// Sanity-check parameters against compile-time array bounds.
	if (m > MMAX || k > KMAX || nerrs > (m - k)) {
		printf(" Input test parameter error\n");
		return -1;
	}

	memcpy(src_err_list, err_list, nerrs);
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0; i < nerrs; i++)
		src_in_err[src_err_list[i]] = 1;

	// Allocate the arrays: m fragment buffers, 64-byte aligned.
	for (i = 0; i < m; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		buffs[i] = buf;
	}

	// m - k scratch buffers to receive recovered fragments
	// (sufficient since nerrs <= m - k was checked above).
	for (i = 0; i < (m - k); i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Make random data in the k source fragments.
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN(m); j++)
			buffs[i][j] = rand();

	// Warm-up encode: generate parity once before timing starts.
	gf_gen_rs_matrix(a, m, k);
	ec_init_tables(k, m - k, &a[k * k], g_tbls);
	ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);

	// Start encode test
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		ec_encode_data(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	}
	perf_stop(&stop);
	printf("erasure_code_encode" TEST_TYPE_STR ": ");
	// Bytes processed per iteration = fragment length * m total fragments.
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);

	// Start decode test
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Construct b by removing error rows; recov[] collects the k
		// surviving fragments in the same row order as b.
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r])
				r++;
			recov[i] = buffs[r];
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		// NOTE(review): gf_gen_rs_matrix submatrices are not guaranteed
		// invertible for all (m, k); this aborts rather than retrying.
		if (gf_invert_matrix(b, d, k) < 0) {
			printf("BAD MATRIX\n");
			return -1;
		}

		// Select the rows of the inverse corresponding to the erased
		// source fragments (valid because every erasure index < k).
		for (i = 0; i < nerrs; i++)
			for (j = 0; j < k; j++)
				c[k * i + j] = d[k * src_err_list[i] + j];

		// Recover data
		ec_init_tables(k, nerrs, c, g_tbls);
		ec_encode_data(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
	}
	perf_stop(&stop);

	// Verify the last recovery against the original source fragments.
	for (i = 0; i < nerrs; i++) {
		if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
			printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
			return -1;
		}
	}

	printf("erasure_code_decode" TEST_TYPE_STR ": ");
	// Decode touches k survivors read + nerrs outputs written per loop.
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);

	printf("done all: Pass\n");
	return 0;
}
|
168
erasure_code/erasure_code_sse_perf.c
Normal file
168
erasure_code/erasure_code_sse_perf.c
Normal file
@ -0,0 +1,168 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 32
|
||||
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (10000*m)
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 32
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (50*m)
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS(m) 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, m, k, nerrs, r;
|
||||
void *buf;
|
||||
u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
|
||||
u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
|
||||
u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
|
||||
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
// Pick test parameters
|
||||
m = 14;
|
||||
k = 10;
|
||||
nerrs = 4;
|
||||
const u8 err_list[] = { 2, 4, 5, 7 };
|
||||
|
||||
printf("erasure_code_sse_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);
|
||||
|
||||
if (m > MMAX || k > KMAX || nerrs > (m - k)) {
|
||||
printf(" Input test parameter error\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
memcpy(src_err_list, err_list, nerrs);
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
for (i = 0; i < nerrs; i++)
|
||||
src_in_err[src_err_list[i]] = 1;
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < m; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
|
||||
printf("alloc error: Fail\n");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
for (i = 0; i < (m - k); i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN(m))) {
|
||||
printf("alloc error: Fail\n");
|
||||
return -1;
|
||||
}
|
||||
temp_buffs[i] = buf;
|
||||
}
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN(m); j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
gf_gen_rs_matrix(a, m, k);
|
||||
ec_init_tables(k, m - k, &a[k * k], g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Start encode test
|
||||
perf_start(&start);
|
||||
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
|
||||
// Make parity vects
|
||||
ec_init_tables(k, m - k, &a[k * k], g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("erasure_code_sse_encode" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
|
||||
|
||||
// Start decode test
|
||||
perf_start(&start);
|
||||
for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
|
||||
// Construct b by removing error rows
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r])
|
||||
r++;
|
||||
recov[i] = buffs[r];
|
||||
for (j = 0; j < k; j++)
|
||||
b[k * i + j] = a[k * r + j];
|
||||
}
|
||||
|
||||
if (gf_invert_matrix(b, d, k) < 0) {
|
||||
printf("BAD MATRIX\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++)
|
||||
for (j = 0; j < k; j++)
|
||||
c[k * i + j] = d[k * src_err_list[i] + j];
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, c, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN(m), k, nerrs, g_tbls, recov, temp_buffs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
if (0 != memcmp(temp_buffs[i], buffs[src_err_list[i]], TEST_LEN(m))) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("erasure_code_sse_decode" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);
|
||||
|
||||
printf("done all: Pass\n");
|
||||
return 0;
|
||||
}
|
764
erasure_code/erasure_code_sse_test.c
Normal file
764
erasure_code/erasure_code_sse_test.c
Normal file
@ -0,0 +1,764 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#define EFENCE_TEST_MIN_SIZE 16
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf as two-digit hex, 32 bytes per output row,
// followed by a terminating newline.
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", 0xff & buf[pos]);
		pos++;
		if ((pos % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of k row pointers as hex, one row per
// line, with a trailing blank line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat k-by-m byte matrix (row-major, row stride m) as hex,
// one matrix row per output line, with a trailing blank line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
|
||||
// Generate decode matrix from encode matrix
|
||||
// Build the decode matrix for the given erasure pattern.
//
// Inputs:  encode_matrix (m x k), src_err_list / src_in_err describing the
//          nerrs erasures (nsrcerrs of which are data rows, i.e. index < k).
// Outputs: decode_matrix (rows used to regenerate each erased fragment),
//          invert_matrix (inverse of the surviving k x k submatrix),
//          decode_index (which m rows / fragments were used as survivors).
// Returns: 0 on success, -1 on malloc failure,
//          NO_INVERT_MATRIX if no invertible survivor set can be found.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	// b is the working k x k survivor submatrix; backup preserves the
	// original survivor rows so failed inversion attempts can restart.
	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	incr = 0;
	// If b is singular, retry by replacing the LAST survivor row with
	// successive later rows of the encode matrix until inversion succeeds
	// or we run off the end of the matrix.
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		// All parity already erased: no alternative rows exist.
		if (nerrs == (m - k)) {
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// Skip candidate rows that are themselves erased parity rows.
		// NOTE(review): the bound `i < nerrs - nsrcerrs` looks odd —
		// erased-parity entries occupy src_err_list[nsrcerrs..nerrs-1],
		// so `i < nerrs` seems intended; also the inner incr++ only
		// helps if the matching entry is visited.  Verify upstream.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased DATA rows: the decode row is simply the corresponding row
	// of the inverse matrix.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	// Erased PARITY rows: decode row = (erased encode row) x invert_matrix,
	// accumulated with GF(2^8) multiply-xor.
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int re = 0;
|
||||
int i, j, p, rtest, m, k;
|
||||
int nerrs, nsrcerrs;
|
||||
void *buf;
|
||||
unsigned int decode_index[MMAX];
|
||||
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
|
||||
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
|
||||
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
|
||||
unsigned char *recov[TEST_SOURCES];
|
||||
|
||||
int rows, align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *temp_ubuffs[TEST_SOURCES];
|
||||
|
||||
printf("erasure_code_sse_test: %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
srand(TEST_SEED);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buffs[i] = buf;
|
||||
}
|
||||
|
||||
// Test erasure code by encode and recovery
|
||||
|
||||
encode_matrix = malloc(MMAX * KMAX);
|
||||
decode_matrix = malloc(MMAX * KMAX);
|
||||
invert_matrix = malloc(MMAX * KMAX);
|
||||
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
|
||||
if (encode_matrix == NULL || decode_matrix == NULL
|
||||
|| invert_matrix == NULL || g_tbls == NULL) {
|
||||
printf("Test failure! Error with malloc\n");
|
||||
return -1;
|
||||
}
|
||||
// Pick a first test
|
||||
m = 9;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// Generate encode matrix encode_matrix
|
||||
// The matrix generated by gf_gen_rs_matrix
|
||||
// is not always invertable.
|
||||
gf_gen_rs_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Pick a first test
|
||||
m = 9;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Do more random tests
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
k = 16;
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
if (k > KMAX)
|
||||
return -1;
|
||||
|
||||
for (rows = 1; rows <= 16; rows++) {
|
||||
m = k + rows;
|
||||
if (m > MMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(size, k, m - k, g_tbls, efence_buffs,
|
||||
&efence_buffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = efence_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
|
||||
size)) {
|
||||
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
|
||||
k, nerrs);
|
||||
|
||||
printf("size = %d\n", size);
|
||||
|
||||
printf("Test erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], align);
|
||||
printf("orig :");
|
||||
dump(efence_buffs[src_err_list[i]], align);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < m; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
}
|
||||
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = ubuffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(ubuffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(ubuffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_ubuffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
offset = ubuffs[i] - buffs[i];
|
||||
|
||||
if (memcmp(buffs[i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign encode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(buffs[i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign encode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
|
||||
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign decode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(temp_buffs[k + i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign decode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test size alignment
|
||||
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;
|
||||
|
||||
for (size = TEST_LEN; size > 0; size -= align) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
ec_encode_data_sse(size, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
ec_encode_data_sse(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("done EC tests: Pass\n");
|
||||
return 0;
|
||||
}
|
763
erasure_code/erasure_code_test.c
Normal file
763
erasure_code/erasure_code_test.c
Normal file
@ -0,0 +1,763 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#define EFENCE_TEST_MIN_SIZE 16
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print 'len' bytes of 'buf' to stdout as two-digit hex values,
// wrapping to a new line every 32 bytes.
void dump(unsigned char *buf, int len)
{
	int idx = 0;
	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k-row by m-column matrix stored as an array of k row pointers,
// one hex-formatted row per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;
	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored contiguously in row-major order,
// one hex-formatted row per output line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;
	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[row * m + col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
// Generate decode matrix from encode matrix.
// Builds a k x k matrix b from the rows of encode_matrix that survive the
// erasures in src_in_err, inverts it, and fills decode_matrix with one row
// per erased buffer (direct inverse rows for erased sources, a GF dot
// product for erased parity). decode_index records which surviving rows
// were used, in order.
// Returns 0 on success, -1 on malloc failure, NO_INVERT_MATRIX if no
// invertible submatrix can be found.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	incr = 0;
	// If b is singular, keep swapping the last row for the next unused
	// parity row of encode_matrix until inversion succeeds or rows run out.
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		if (nerrs == (m - k)) {
			// No spare parity rows left to substitute.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// NOTE(review): the bound 'nerrs - nsrcerrs' mixes two counts;
		// looks like it was meant to scan all parity erasures
		// (indices nsrcerrs..nerrs-1) — confirm against upstream.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			// Candidate row index ran past the matrix.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased source buffers decode directly from rows of the inverse.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
// Erasure-code regression driver. Runs five phases, any failure returns -1:
//   1. Fixed 9x5 test with an RS (Vandermonde-style) encode matrix.
//   2. Fixed 9x5 test with a Cauchy encode matrix.
//   3. RANDOMS randomized (m, k) encode/decode round trips.
//   4. "Electric Fence" tests reading/writing at the very end of buffers.
//   5. Pointer- and size-alignment sweeps.
// All randomness is seeded with TEST_SEED, so a run is reproducible.
int main(int argc, char *argv[])
{
	int re = 0;
	int i, j, p, rtest, m, k;
	int nerrs, nsrcerrs;
	void *buf;
	unsigned int decode_index[MMAX];
	unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
	unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
	unsigned char *recov[TEST_SOURCES];

	int rows, align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *temp_ubuffs[TEST_SOURCES];

	printf("erasure_code_test: %dx%d ", TEST_SOURCES, TEST_LEN);
	srand(TEST_SEED);

	// Allocate the arrays (64-byte aligned so the aligned code paths run)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		temp_buffs[i] = buf;
	}

	// Test erasure code by encode and recovery

	encode_matrix = malloc(MMAX * KMAX);
	decode_matrix = malloc(MMAX * KMAX);
	invert_matrix = malloc(MMAX * KMAX);
	g_tbls = malloc(KMAX * TEST_SOURCES * 32);
	if (encode_matrix == NULL || decode_matrix == NULL
	    || invert_matrix == NULL || g_tbls == NULL) {
		printf("Test failure! Error with malloc\n");
		return -1;
	}
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Generate encode matrix encode_matrix
	// The matrix generated by gf_gen_rs_matrix
	// is not always invertable.
	gf_gen_rs_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		// Each recovered buffer must match the original it replaces.
		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// The matrix generated by gf_gen_cauchy1_matrix
	// is always invertable.
	gf_gen_cauchy1_matrix(encode_matrix, m, k);

	// Generate g_tbls from encode matrix encode_matrix
	ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);

	// Perform matrix dot_prod for EC encoding
	// using g_tbls from encode matrix encode_matrix
	ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

	// Choose random buffers to be in erasure
	memset(src_in_err, 0, TEST_SOURCES);
	gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

	// Generate decode matrix
	re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
				  invert_matrix, decode_index, src_err_list, src_in_err,
				  nerrs, nsrcerrs, k, m);
	if (re != 0) {
		printf("Fail to gf_gen_decode_matrix\n");
		return -1;
	}
	// Pack recovery array as list of valid sources
	// Its order must be the same as the order
	// to generate matrix b in gf_gen_decode_matrix
	for (i = 0; i < k; i++) {
		recov[i] = buffs[decode_index[i]];
	}

	// Recover data
	ec_init_tables(k, nerrs, decode_matrix, g_tbls);
	ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
	for (i = 0; i < nerrs; i++) {

		if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf(" - erase list = ");
			for (j = 0; j < nerrs; j++)
				printf(" %d", src_err_list[j]);
			printf(" - Index = ");
			for (p = 0; p < k; p++)
				printf(" %d", decode_index[p]);
			printf("\nencode_matrix:\n");
			dump_u8xu8((u8 *) encode_matrix, m, k);
			printf("inv b:\n");
			dump_u8xu8((u8 *) invert_matrix, k, k);
			printf("\ndecode_matrix:\n");
			dump_u8xu8((u8 *) decode_matrix, m, k);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buffs[k + i], 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		// Draw random m >= 2 and 1 <= k < m.
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((u8 *) encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((u8 *) decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	k = 16;
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	if (k > KMAX)
		return -1;

	for (rows = 1; rows <= 16; rows++) {
		m = k + rows;
		if (m > MMAX)
			return -1;

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (size = EFENCE_TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
			for (i = 0; i < m; i++) {	// Line up TEST_SIZE from end
				efence_buffs[i] = buffs[i] + TEST_LEN - size;
			}

			// The matrix generated by gf_gen_cauchy1_matrix
			// is always invertable.
			gf_gen_cauchy1_matrix(encode_matrix, m, k);

			// Make parity vects
			// Generate g_tbls from encode matrix a
			ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
			// Perform matrix dot_prod for EC encoding
			// using g_tbls from encode matrix a
			ec_encode_data(size, k, m - k, g_tbls, efence_buffs, &efence_buffs[k]);

			// Random errors
			memset(src_in_err, 0, TEST_SOURCES);
			gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

			// Generate decode matrix
			re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
						  invert_matrix, decode_index, src_err_list,
						  src_in_err, nerrs, nsrcerrs, k, m);
			if (re != 0) {
				printf("Fail to gf_gen_decode_matrix\n");
				return -1;
			}
			// Pack recovery array as list of valid sources
			// Its order must be the same as the order
			// to generate matrix b in gf_gen_decode_matrix
			for (i = 0; i < k; i++) {
				recov[i] = efence_buffs[decode_index[i]];
			}

			// Recover data
			ec_init_tables(k, nerrs, decode_matrix, g_tbls);
			ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

			for (i = 0; i < nerrs; i++) {

				if (0 !=
				    memcmp(temp_buffs[k + i], efence_buffs[src_err_list[i]],
					   size)) {
					printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
					       k, nerrs);

					printf("size = %d\n", size);

					printf("Test erase list = ");
					for (j = 0; j < nerrs; j++)
						printf(" %d", src_err_list[j]);
					printf(" - Index = ");
					for (p = 0; p < k; p++)
						printf(" %d", decode_index[p]);
					printf("\nencode_matrix:\n");
					dump_u8xu8((u8 *) encode_matrix, m, k);
					printf("inv b:\n");
					dump_u8xu8((u8 *) invert_matrix, k, k);
					printf("\ndecode_matrix:\n");
					dump_u8xu8((u8 *) decode_matrix, m, k);

					printf("recov %d:", src_err_list[i]);
					dump(temp_buffs[k + i], align);
					printf("orig :");
					dump(efence_buffs[src_err_list[i]], align);
					return -1;
				}
			}
		}

	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < m; i++) {
			memset(buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			memset(temp_buffs[i], 0, TEST_LEN);	// zero pad to check write-over
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
			temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
		}

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);

		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = ubuffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_ubuffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_ubuffs[k + i], ubuffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(ubuffs, m, 25);
				printf("orig :");
				dump(ubuffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_ubuffs[k + i], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		for (i = 0; i < m; i++) {

			offset = ubuffs[i] - buffs[i];

			if (memcmp(buffs[i], temp_buffs[0], offset)) {
				printf("Fail rand ualign encode pad start\n");
				return -1;
			}
			if (memcmp
			    (buffs[i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign encode pad end\n");
				return -1;
			}
		}

		for (i = 0; i < nerrs; i++) {

			offset = temp_ubuffs[k + i] - temp_buffs[k + i];
			if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
				printf("Fail rand ualign decode pad start\n");
				return -1;
			}
			if (memcmp
			    (temp_buffs[k + i] + offset + size, temp_buffs[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign decode pad end\n");
				return -1;
			}
		}

		putchar('.');
	}

	// Test size alignment

	align = (LEN_ALIGN_CHK_B != 0) ? 13 : 16;

	for (size = TEST_LEN; size > 0; size -= align) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		for (i = 0; i < k; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		// The matrix generated by gf_gen_cauchy1_matrix
		// is always invertable.
		gf_gen_cauchy1_matrix(encode_matrix, m, k);

		// Make parity vects
		// Generate g_tbls from encode matrix a
		ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
		// Perform matrix dot_prod for EC encoding
		// using g_tbls from encode matrix a
		ec_encode_data(size, k, m - k, g_tbls, buffs, &buffs[k]);

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
		// Generate decode matrix
		re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
					  invert_matrix, decode_index, src_err_list,
					  src_in_err, nerrs, nsrcerrs, k, m);
		if (re != 0) {
			printf("Fail to gf_gen_decode_matrix\n");
			return -1;
		}
		// Pack recovery array as list of valid sources
		// Its order must be the same as the order
		// to generate matrix b in gf_gen_decode_matrix
		for (i = 0; i < k; i++) {
			recov[i] = buffs[decode_index[i]];
		}

		// Recover data
		ec_init_tables(k, nerrs, decode_matrix, g_tbls);
		ec_encode_data(size, k, nerrs, g_tbls, recov, &temp_buffs[k]);

		for (i = 0; i < nerrs; i++) {

			if (0 != memcmp(temp_buffs[k + i], buffs[src_err_list[i]], size)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				for (j = 0; j < nerrs; j++)
					printf(" %d", src_err_list[j]);
				printf(" - Index = ");
				for (p = 0; p < k; p++)
					printf(" %d", decode_index[p]);
				printf("\nencode_matrix:\n");
				dump_u8xu8((unsigned char *)encode_matrix, m, k);
				printf("inv b:\n");
				dump_u8xu8((unsigned char *)invert_matrix, k, k);
				printf("\ndecode_matrix:\n");
				dump_u8xu8((unsigned char *)decode_matrix, m, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buffs[k + i], 25);
				return -1;
			}
		}
	}

	printf("done EC tests: Pass\n");
	return 0;
}
|
306
erasure_code/erasure_code_update_perf.c
Normal file
306
erasure_code/erasure_code_update_perf.c
Normal file
@ -0,0 +1,306 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
#include "test.h"
|
||||
|
||||
//By default, test multibinary version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST ec_encode_data_update
|
||||
# define REF_FUNCTION ec_encode_data
|
||||
#endif
|
||||
|
||||
//By default, test EC(8+4)
|
||||
#if (!defined(VECT))
|
||||
# define VECT 4
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 32
|
||||
# define TEST_LEN(m) ((128*1024 / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (10000*m)
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 32
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN(m) ((GT_L3_CACHE / m) & ~(64-1))
|
||||
# define TEST_LOOPS(m) (50*m)
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS(m) 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print 'len' bytes of 'buf' to stdout as two-digit hex values,
// wrapping to a new line every 32 bytes.
void dump(unsigned char *buf, int len)
{
	int idx = 0;
	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Performance test for the macro-selected encode-update routine
// (FUNCTION_UNDER_TEST, default ec_encode_data_update) against the one-shot
// reference encoder (REF_FUNCTION, default ec_encode_data).  Verifies that
// feeding sources one at a time through the update API produces the same
// parity as the one-shot API, then times encode and decode loops via the
// perf_start/perf_stop helpers from test.h.
// Returns 0 on pass, -1 on any allocation, parameter, or mismatch failure.
int main(int argc, char *argv[])
{
	int i, j, rtest, m, k, nerrs, r;
	void *buf;
	u8 *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
	u8 *update_buffs[TEST_SOURCES];
	u8 *perf_update_buffs[TEST_SOURCES];
	// a = encode matrix, b = surviving-rows matrix, d = inverse of b,
	// c = decode rows extracted from d for the erased indices.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], c[MMAX * KMAX], d[MMAX * KMAX];
	u8 g_tbls[KMAX * TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
	struct perf start, stop;

	// Pick test parameters: k data buffers plus VECT parity buffers,
	// and erase exactly VECT of them (fixed list below, not random).
	k = 10;
	m = k + VECT;
	nerrs = VECT;
	const u8 err_list[] = { 0, 2, 4, 5, 7, 8 };

	printf(xstr(FUNCTION_UNDER_TEST) "_perf: %dx%d %d\n", m, TEST_LEN(m), nerrs);

	if (m > MMAX || k > KMAX || nerrs > (m - k)) {
		printf(" Input test parameter error\n");
		return -1;
	}

	// Mark the first nerrs entries of err_list as erased sources.
	memcpy(src_err_list, err_list, nerrs);
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0; i < nerrs; i++)
		src_in_err[src_err_list[i]] = 1;

	// Allocate the arrays (64-byte aligned; buffers are never freed —
	// acceptable for a test binary that exits immediately after).
	for (i = 0; i < m; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		buffs[i] = buf;
	}

	// Only m - k temp buffers are needed: they receive recovered data
	// for at most nerrs (= VECT = m - k) erased buffers.
	for (i = 0; i < (m - k); i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail\n");
			return -1;
		}
		temp_buffs[i] = buf;
		memset(temp_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}

	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail");
			return -1;
		}
		update_buffs[i] = buf;
		memset(update_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN(m))) {
			printf("alloc error: Fail");
			return -1;
		}
		perf_update_buffs[i] = buf;
		memset(perf_update_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}

	// Make random data (same content in buffs and update_buffs so the
	// two encode paths can be compared byte-for-byte).
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN(m); j++) {
			buffs[i][j] = rand();
			update_buffs[i][j] = buffs[i][j];
		}

	// Encode once with the reference one-shot API...
	gf_gen_rs_matrix(a, m, k);
	ec_init_tables(k, m - k, &a[k * k], g_tbls);
	REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);

	// ...and once incrementally, one source buffer per call.
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, update_buffs[i],
				    &update_buffs[k]);
	}
	// Parity from the update path must match the one-shot path exactly.
	for (i = 0; i < m - k; i++) {
		if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN(m))) {
			printf("\nupdate_buffs%d :", i);
			dump(update_buffs[k + i], 25);
			printf("buffs%d :", i);
			dump(buffs[k + i], 25);
			return -1;
		}
	}

#ifdef DO_REF_PERF
	// Optional baseline: time the one-shot reference encoder.
	REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	// Start encode test
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		REF_FUNCTION(TEST_LEN(m), k, m - k, g_tbls, buffs, &buffs[k]);
	}
	perf_stop(&stop);
	printf(xstr(REF_FUNCTION) TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);
#endif
	// Warm-up pass before timing the update-style encode.
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, perf_update_buffs[i],
				    &perf_update_buffs[k]);
	}
	// Start encode test: full update encode, all k sources per iteration.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		for (i = 0; i < k; i++) {
			FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls,
					    perf_update_buffs[i], &perf_update_buffs[k]);
		}
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m) * rtest);

	// Start encode test: single-source update, including table re-init.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		ec_init_tables(k, m - k, &a[k * k], g_tbls);
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, 0, g_tbls, perf_update_buffs[0],
				    &perf_update_buffs[k]);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) "_single_src" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m - k + 1) * rtest);

	// Start encode test: single-source update only (no table re-init),
	// isolating the cost of the update call itself.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Make parity vects
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, 0, g_tbls, perf_update_buffs[0],
				    &perf_update_buffs[k]);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) "_single_src_simple" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (m - k + 1) * rtest);

	// Re-zero parity destinations and re-encode so decode operates on
	// freshly produced parity.
	for (i = k; i < m; i++) {
		memset(update_buffs[i], 0, TEST_LEN(m));	// initialize the destination buffer to be zero for update function
	}
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, m - k, i, g_tbls, update_buffs[i],
				    &update_buffs[k]);
	}
	// Construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		recov[i] = update_buffs[r];
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	if (gf_invert_matrix(b, d, k) < 0) {
		printf("BAD MATRIX\n");
		return -1;
	}

	// Pull the rows of the inverse that reconstruct the erased buffers.
	for (i = 0; i < nerrs; i++)
		for (j = 0; j < k; j++)
			c[k * i + j] = d[k * src_err_list[i] + j];

	// Recover data once into temp_buffs (correctness-checked below).
	ec_init_tables(k, nerrs, c, g_tbls);
	for (i = 0; i < k; i++) {
		FUNCTION_UNDER_TEST(TEST_LEN(m), k, nerrs, i, g_tbls, recov[i], temp_buffs);
	}
	// Start decode test: timed loop writes into perf_update_buffs so the
	// checked results in temp_buffs are left untouched.
	perf_start(&start);
	for (rtest = 0; rtest < TEST_LOOPS(m); rtest++) {
		// Construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r])
				r++;
			recov[i] = update_buffs[r];
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix(b, d, k) < 0) {
			printf("BAD MATRIX\n");
			return -1;
		}

		for (i = 0; i < nerrs; i++)
			for (j = 0; j < k; j++)
				c[k * i + j] = d[k * src_err_list[i] + j];

		// Recover data
		ec_init_tables(k, nerrs, c, g_tbls);
		for (i = 0; i < k; i++) {
			FUNCTION_UNDER_TEST(TEST_LEN(m), k, nerrs, i, g_tbls, recov[i],
					    perf_update_buffs);
		}
	}
	perf_stop(&stop);

	// Recovered buffers must match the original (pre-erasure) sources.
	for (i = 0; i < nerrs; i++) {
		if (0 != memcmp(temp_buffs[i], update_buffs[src_err_list[i]], TEST_LEN(m))) {
			printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
			return -1;
		}
	}

	printf(xstr(FUNCTION_UNDER_TEST) "_decode" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)(TEST_LEN(m)) * (k + nerrs) * rtest);

	printf("done all: Pass\n");
	return 0;
}
|
957
erasure_code/erasure_code_update_test.c
Normal file
957
erasure_code/erasure_code_update_test.c
Normal file
@ -0,0 +1,957 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef ALIGN_SIZE
|
||||
# define ALIGN_SIZE 16
|
||||
#endif
|
||||
|
||||
//By default, test multibinary version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST ec_encode_data_update
|
||||
# define REF_FUNCTION ec_encode_data
|
||||
#endif
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 127
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B ALIGN_SIZE
|
||||
# define LEN_ALIGN_CHK_B ALIGN_SIZE // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#ifndef TEST_SEED
|
||||
#define TEST_SEED 11
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print a buffer as space-separated hex bytes, wrapping every 32 bytes
// and finishing with a trailing newline.
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", buf[pos] & 0xff);
		pos++;
		if ((pos % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored as an array of k row pointers,
// one row of hex bytes per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix stored contiguously in row-major order,
// one row of hex bytes per output line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[col + (row * m)] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Generate Random errors
|
||||
static void gen_err_list(unsigned char *src_err_list,
|
||||
unsigned char *src_in_err, int *pnerrs, int *pnsrcerrs, int k, int m)
|
||||
{
|
||||
int i, err;
|
||||
int nerrs = 0, nsrcerrs = 0;
|
||||
|
||||
for (i = 0, nerrs = 0, nsrcerrs = 0; i < m && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err) {
|
||||
src_err_list[nerrs++] = i;
|
||||
if (i < k) {
|
||||
nsrcerrs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= m) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
if (err < k)
|
||||
nsrcerrs = 1;
|
||||
}
|
||||
*pnerrs = nerrs;
|
||||
*pnsrcerrs = nsrcerrs;
|
||||
return;
|
||||
}
|
||||
|
||||
#define NO_INVERT_MATRIX -2
|
||||
// Generate decode matrix from encode matrix
|
||||
// Generate decode matrix from encode matrix.
// Builds the k x k matrix of surviving rows, inverts it, and derives the
// decode rows for each erased buffer: direct inverse rows for erased data
// buffers, and inverse-times-encode-row dot products for erased parity.
// decode_index[] receives the indices of the k surviving rows used.
// Returns 0 on success, -1 on allocation failure, NO_INVERT_MATRIX if no
// invertible set of surviving rows can be found.
// NOTE: gf_invert_matrix/gf_mul are presumably GF(2^8) routines from
// erasure_code.h (included above) — semantics not visible here.
static int gf_gen_decode_matrix(unsigned char *encode_matrix,
				unsigned char *decode_matrix,
				unsigned char *invert_matrix,
				unsigned int *decode_index,
				unsigned char *src_err_list,
				unsigned char *src_in_err,
				int nerrs, int nsrcerrs, int k, int m)
{
	int i, j, p;
	int r;
	unsigned char *backup, *b, s;
	int incr = 0;

	b = malloc(MMAX * KMAX);
	backup = malloc(MMAX * KMAX);

	if (b == NULL || backup == NULL) {
		printf("Test failure! Error with malloc\n");
		free(b);
		free(backup);
		return -1;
	}
	// Construct matrix b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r])
			r++;
		for (j = 0; j < k; j++) {
			b[k * i + j] = encode_matrix[k * r + j];
			// backup keeps a pristine copy so b can be restored
			// each time a substitute last row is tried below.
			backup[k * i + j] = encode_matrix[k * r + j];
		}
		decode_index[i] = r;
	}
	incr = 0;
	// If b is singular, repeatedly replace its last row with the next
	// unused encode-matrix row (offset by incr) until inversion succeeds
	// or the rows run out.
	while (gf_invert_matrix(b, invert_matrix, k) < 0) {
		if (nerrs == (m - k)) {
			// All parity rows are erased — no substitutes exist.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		incr++;
		memcpy(b, backup, MMAX * KMAX);
		// Bump incr past any erased parity row at the candidate
		// position.  NOTE(review): this loop only inspects entries
		// nsrcerrs..nerrs-nsrcerrs-1 and does not restart after
		// incrementing — looks like it may under-scan when
		// nsrcerrs > 0; confirm against upstream isa-l history.
		for (i = nsrcerrs; i < nerrs - nsrcerrs; i++) {
			if (src_err_list[i] == (decode_index[k - 1] + incr)) {
				// skip the erased parity line
				incr++;
				continue;
			}
		}
		if (decode_index[k - 1] + incr >= m) {
			// Ran past the last encode row without success.
			free(b);
			free(backup);
			printf("BAD MATRIX\n");
			return NO_INVERT_MATRIX;
		}
		decode_index[k - 1] += incr;
		for (j = 0; j < k; j++)
			b[k * (k - 1) + j] = encode_matrix[k * decode_index[k - 1] + j];

	};

	// Erased data buffers decode directly from rows of the inverse.
	for (i = 0; i < nsrcerrs; i++) {
		for (j = 0; j < k; j++) {
			decode_matrix[k * i + j] = invert_matrix[k * src_err_list[i] + j];
		}
	}
	/* src_err_list from encode_matrix * invert of b for parity decoding */
	for (p = nsrcerrs; p < nerrs; p++) {
		for (i = 0; i < k; i++) {
			s = 0;
			for (j = 0; j < k; j++)
				s ^= gf_mul(invert_matrix[j * k + i],
					    encode_matrix[k * src_err_list[p] + j]);

			decode_matrix[k * p + i] = s;
		}
	}
	free(b);
	free(backup);
	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int re = 0;
|
||||
int i, j, p, rtest, m, k;
|
||||
int nerrs, nsrcerrs;
|
||||
void *buf;
|
||||
unsigned int decode_index[MMAX];
|
||||
unsigned char *temp_buffs[TEST_SOURCES], *buffs[TEST_SOURCES];
|
||||
unsigned char *update_buffs[TEST_SOURCES];
|
||||
unsigned char *encode_matrix, *decode_matrix, *invert_matrix, *g_tbls;
|
||||
unsigned char src_in_err[TEST_SOURCES], src_err_list[TEST_SOURCES];
|
||||
unsigned char *recov[TEST_SOURCES];
|
||||
|
||||
int rows, align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned char *efence_update_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *update_ubuffs[TEST_SOURCES];
|
||||
u8 *temp_ubuffs[TEST_SOURCES];
|
||||
|
||||
printf("test " xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
srand(TEST_SEED);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buffs[i] = buf;
|
||||
memset(temp_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
update_buffs[i] = buf;
|
||||
memset(update_buffs[i], 0, TEST_LEN); // initialize the destination buffer to be zero for update function
|
||||
}
|
||||
// Test erasure code by encode and recovery
|
||||
|
||||
encode_matrix = malloc(MMAX * KMAX);
|
||||
decode_matrix = malloc(MMAX * KMAX);
|
||||
invert_matrix = malloc(MMAX * KMAX);
|
||||
g_tbls = malloc(KMAX * TEST_SOURCES * 32);
|
||||
if (encode_matrix == NULL || decode_matrix == NULL
|
||||
|| invert_matrix == NULL || g_tbls == NULL) {
|
||||
printf("Test failure! Error with malloc\n");
|
||||
return -1;
|
||||
}
|
||||
// Pick a first test
|
||||
m = 15;
|
||||
k = 10;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// Generate encode matrix encode_matrix
|
||||
// The matrix generated by gf_gen_rs_matrix
|
||||
// is not always invertable.
|
||||
gf_gen_rs_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
|
||||
printf("\nupdate_buffs%d :", i);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d :", i);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
REF_FUNCTION(TEST_LEN, k, nerrs, g_tbls, recov, &temp_buffs[k]);
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Pick a first test
|
||||
m = 7;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
memset(update_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Generate g_tbls from encode matrix encode_matrix
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix encode_matrix
|
||||
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
|
||||
printf("\nupdate_buffs%d :", i);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d :", i);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Choose random buffers to be in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list, src_in_err,
|
||||
nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, nerrs, i, g_tbls, recov[i], &temp_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 != memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Do more random tests
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
memset(update_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(TEST_LEN, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], TEST_LEN)) {
|
||||
printf("\nupdate_buffs%d :", i);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d :", i);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_buffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]],
|
||||
TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(update_buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
k = 16;
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
|
||||
if (k > KMAX)
|
||||
return -1;
|
||||
|
||||
for (rows = 1; rows <= 16; rows++) {
|
||||
m = k + rows;
|
||||
if (m > MMAX)
|
||||
return -1;
|
||||
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
memset(update_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < TEST_LEN; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
for (size = 0; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < m; i++) { // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
efence_update_buffs[i] = update_buffs[i] + TEST_LEN - size;
|
||||
}
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < m; i++) {
|
||||
memset(efence_buffs[i], 0, size);
|
||||
memset(efence_update_buffs[i], 0, size);
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(size, k, m - k, g_tbls, efence_buffs, &efence_buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls,
|
||||
efence_update_buffs[i],
|
||||
&efence_update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 !=
|
||||
memcmp(efence_update_buffs[k + i], efence_buffs[k + i],
|
||||
size)) {
|
||||
printf("\nefence_update_buffs%d :", i);
|
||||
dump(efence_update_buffs[k + i], 25);
|
||||
printf("efence_buffs%d :", i);
|
||||
dump(efence_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = efence_update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_buffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i],
|
||||
efence_update_buffs[src_err_list[i]], size)) {
|
||||
printf("Efence: Fail error recovery (%d, %d, %d)\n", m,
|
||||
k, nerrs);
|
||||
|
||||
printf("size = %d\n", size);
|
||||
|
||||
printf("Test erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((u8 *) encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((u8 *) decode_matrix, m, k);
|
||||
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], align);
|
||||
printf("orig :");
|
||||
dump(efence_update_buffs[src_err_list[i]], align);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~15;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < m; i++) {
|
||||
memset(buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(update_buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(temp_buffs[i], 0, TEST_LEN); // zero pad to check write-over
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
update_ubuffs[i] =
|
||||
update_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
temp_ubuffs[i] = temp_buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
}
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < m; i++) {
|
||||
memset(ubuffs[i], 0, size);
|
||||
memset(update_ubuffs[i], 0, size);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
ubuffs[i][j] = rand();
|
||||
update_ubuffs[i][j] = ubuffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(size, k, m - k, g_tbls, ubuffs, &ubuffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls, update_ubuffs[i],
|
||||
&update_ubuffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_ubuffs[k + i], ubuffs[k + i], size)) {
|
||||
printf("\nupdate_ubuffs%d :", i);
|
||||
dump(update_ubuffs[k + i], 25);
|
||||
printf("ubuffs%d :", i);
|
||||
dump(ubuffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_ubuffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < m; i++) {
|
||||
memset(temp_ubuffs[i], 0, size);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_ubuffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_ubuffs[k + i], update_ubuffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(update_ubuffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(update_ubuffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_ubuffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(temp_buffs[0], 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
offset = update_ubuffs[i] - update_buffs[i];
|
||||
|
||||
if (memcmp(update_buffs[i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign encode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(update_buffs[i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign encode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
offset = temp_ubuffs[k + i] - temp_buffs[k + i];
|
||||
if (memcmp(temp_buffs[k + i], temp_buffs[0], offset)) {
|
||||
printf("Fail rand ualign decode pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp
|
||||
(temp_buffs[k + i] + offset + size, temp_buffs[0],
|
||||
PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign decode pad end\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test size alignment
|
||||
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 13 : ALIGN_SIZE;
|
||||
|
||||
for (size = TEST_LEN; size >= 0; size -= align) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
// Zero the destination buffer for update function
|
||||
for (i = k; i < TEST_SOURCES; i++) {
|
||||
memset(buffs[i], 0, size);
|
||||
memset(update_buffs[i], 0, size);
|
||||
}
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
buffs[i][j] = rand();
|
||||
update_buffs[i][j] = buffs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// The matrix generated by gf_gen_cauchy1_matrix
|
||||
// is always invertable.
|
||||
gf_gen_cauchy1_matrix(encode_matrix, m, k);
|
||||
|
||||
// Make parity vects
|
||||
// Generate g_tbls from encode matrix a
|
||||
ec_init_tables(k, m - k, &encode_matrix[k * k], g_tbls);
|
||||
// Perform matrix dot_prod for EC encoding
|
||||
// using g_tbls from encode matrix a
|
||||
REF_FUNCTION(size, k, m - k, g_tbls, buffs, &buffs[k]);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, m - k, i, g_tbls, update_buffs[i],
|
||||
&update_buffs[k]);
|
||||
}
|
||||
for (i = 0; i < m - k; i++) {
|
||||
if (0 != memcmp(update_buffs[k + i], buffs[k + i], size)) {
|
||||
printf("\nupdate_buffs%d (size=%d) :", i, size);
|
||||
dump(update_buffs[k + i], 25);
|
||||
printf("buffs%d (size=%d) :", i, size);
|
||||
dump(buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
gen_err_list(src_err_list, src_in_err, &nerrs, &nsrcerrs, k, m);
|
||||
// Generate decode matrix
|
||||
re = gf_gen_decode_matrix(encode_matrix, decode_matrix,
|
||||
invert_matrix, decode_index, src_err_list,
|
||||
src_in_err, nerrs, nsrcerrs, k, m);
|
||||
if (re != 0) {
|
||||
printf("Fail to gf_gen_decode_matrix\n");
|
||||
return -1;
|
||||
}
|
||||
// Pack recovery array as list of valid sources
|
||||
// Its order must be the same as the order
|
||||
// to generate matrix b in gf_gen_decode_matrix
|
||||
for (i = 0; i < k; i++) {
|
||||
recov[i] = update_buffs[decode_index[i]];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
memset(temp_buffs[i], 0, TEST_LEN);
|
||||
}
|
||||
ec_init_tables(k, nerrs, decode_matrix, g_tbls);
|
||||
for (i = 0; i < k; i++) {
|
||||
FUNCTION_UNDER_TEST(size, k, nerrs, i, g_tbls, recov[i],
|
||||
&temp_buffs[k]);
|
||||
}
|
||||
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
|
||||
if (0 !=
|
||||
memcmp(temp_buffs[k + i], update_buffs[src_err_list[i]], size)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (j = 0; j < nerrs; j++)
|
||||
printf(" %d", src_err_list[j]);
|
||||
printf(" - Index = ");
|
||||
for (p = 0; p < k; p++)
|
||||
printf(" %d", decode_index[p]);
|
||||
printf("\nencode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)encode_matrix, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((unsigned char *)invert_matrix, k, k);
|
||||
printf("\ndecode_matrix:\n");
|
||||
dump_u8xu8((unsigned char *)decode_matrix, m, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(update_buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(update_buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buffs[k + i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf("done EC tests: Pass\n");
|
||||
return 0;
|
||||
}
|
337
erasure_code/gf_2vect_dot_prod_avx.asm
Normal file
337
erasure_code/gf_2vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,337 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_reg r12, 3*16 + 0*8
|
||||
save_reg r13, 3*16 + 1*8
|
||||
save_reg r14, 3*16 + 2*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
mov r12, [rsp + 3*16 + 0*8]
|
||||
mov r13, [rsp + 3*16 + 1*8]
|
||||
mov r14, [rsp + 3*16 + 2*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(0)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*1 ;1 local variable
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*1 ;1 local variable
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define dest2 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f xmm8
|
||||
%define xgft1_lo xmm7
|
||||
%define xgft1_hi xmm6
|
||||
%define xgft2_lo xmm5
|
||||
%define xgft2_hi xmm4
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%else ;32-bit code
|
||||
%define xmask0f xmm4
|
||||
%define xgft1_lo xmm7
|
||||
%define xgft1_hi xmm6
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%endif
|
||||
|
||||
align 16
|
||||
global gf_2vect_dot_prod_avx:function
|
||||
|
||||
func(gf_2vect_dot_prod_avx)
|
||||
FUNC_SAVE
|
||||
SLDR len, len_m
|
||||
sub len, 16
|
||||
SSTR len_m, len
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
SLDR dest1, dest1_m
|
||||
mov dest2, [dest1+PS]
|
||||
SSTR dest2_m, dest2
|
||||
mov dest1, [dest1]
|
||||
SSTR dest1_m, dest1
|
||||
|
||||
.loop16
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
|
||||
.next_vect
|
||||
SLDR src, src_m
|
||||
mov ptr, [src+vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%endif
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
%ifidn PS,4 ; 32-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%endif
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
SLDR dest1, dest1_m
|
||||
SLDR dest2, dest2_m
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
|
||||
SLDR len, len_m
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop16 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_dot_prod_avx, 02, 05, 0191
|
356
erasure_code/gf_2vect_dot_prod_avx2.asm
Normal file
356
erasure_code/gf_2vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,356 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
vmovdqa [rsp + 0*16], xmm6
|
||||
vmovdqa [rsp + 1*16], xmm7
|
||||
vmovdqa [rsp + 2*16], xmm8
|
||||
save_reg r12, 3*16 + 0*8
|
||||
save_reg r13, 3*16 + 1*8
|
||||
save_reg r14, 3*16 + 2*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
mov r12, [rsp + 3*16 + 0*8]
|
||||
mov r13, [rsp + 3*16 + 1*8]
|
||||
mov r14, [rsp + 3*16 + 2*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp.w edx
|
||||
%define tmp.b dl
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(0)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*1 ;1 local variable
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*1 ;1 local variable
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define dest2 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f ymm8
|
||||
%define xmask0fx xmm8
|
||||
%define xgft1_lo ymm7
|
||||
%define xgft1_hi ymm6
|
||||
%define xgft2_lo ymm5
|
||||
%define xgft2_hi ymm4
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%else ;32-bit code
|
||||
%define xmask0f ymm7
|
||||
%define xmask0fx xmm7
|
||||
%define xgft1_lo ymm5
|
||||
%define xgft1_hi ymm4
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
|
||||
%endif
|
||||
|
||||
align 16
|
||||
global gf_2vect_dot_prod_avx2:function
|
||||
|
||||
func(gf_2vect_dot_prod_avx2)
|
||||
FUNC_SAVE
|
||||
SLDR len, len_m
|
||||
sub len, 32
|
||||
SSTR len_m, len
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
SLDR dest1, dest1_m
|
||||
mov dest2, [dest1+PS]
|
||||
SSTR dest2_m, dest2
|
||||
mov dest1, [dest1]
|
||||
SSTR dest1_m, dest1
|
||||
|
||||
.loop32
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
|
||||
.next_vect
|
||||
SLDR src, src_m
|
||||
mov ptr, [src+vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%else
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
%endif
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
%ifidn PS,4 ; 32-bit code
|
||||
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
add tmp, 32
|
||||
add vec_i, PS
|
||||
%endif
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
SLDR dest1, dest1_m
|
||||
SLDR dest2, dest2_m
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
|
||||
SLDR len, len_m
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop32 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_dot_prod_avx2, 04, 05, 0196
|
339
erasure_code/gf_2vect_dot_prod_sse.asm
Normal file
339
erasure_code/gf_2vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,339 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
;;;
;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
;;; Computes two GF(2^8) dot products over the same set of source buffers
;;; using SSE (pshufb nibble-table lookups), writing one output buffer per
;;; product.  Tables in g_tbls hold 32 bytes per source per output: a
;;; low-nibble table followed by a high-nibble table.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: args arrive in rdi, rsi, rdx, rcx, r8, r9.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r9
 %define tmp4  r12		; must be saved and restored
 %define return rax
 ;; SLDR/SSTR are no-ops in 64-bit builds: all working values fit in
 ;; registers, so the stack load/store used by elf32 is not needed.
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: args arrive in rcx, rdx, r8, r9; 5th arg on stack.
 ;; xmm6-xmm8 and r12-r14 are callee-saved here.
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12 		; must be saved, loaded and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size  3*16 + 3*8 	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_reg	r12,  3*16 + 0*8
	save_reg	r13,  3*16 + 1*8
	save_reg	r14,  3*16 + 2*8
	end_prolog
	mov	arg4, arg(4)		; load 5th argument from the stack
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm8, [rsp + 2*16]
	mov	r12,  [rsp + 3*16 + 0*8]
	mov	r13,  [rsp + 3*16 + 1*8]
	mov	r14,  [rsp + 3*16 + 2*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;; 32-bit cdecl stack layout used by arg()/var() below:
;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 ;; Too few registers in 32-bit mode: several logical names alias the
 ;; same physical register (trans/trans2), with the live value spilled to
 ;; the stack slots below and reloaded via SLDR/SSTR as needed.
 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m	 arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m	 arg(2)
 %define arg3	 trans
 %define arg3_m	 arg(3)
 %define arg4	 trans
 %define arg4_m	 arg(4)
 %define tmp	 edx
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp4	 trans2
 %define tmp4_m	 var(0)
 %define return	 eax
 %macro SLDR 2			;; stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*1	;1 local variable
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*1	;1 local variable
	pop	ebp
 %endmacro

%endif	; output formats

;; Friendly names for the function arguments and loop state.
%define len   arg0
%define vec   arg1
%define mul_array arg2
%define src   arg3
%define dest1 arg4

%define vec_i tmp2
%define ptr   tmp3
%define dest2 tmp4
%define pos   return

%ifidn PS,4			;32-bit code
 ;; Stack homes for values that share a register in 32-bit mode.
 %define len_m	 arg0_m
 %define src_m	 arg3_m
 %define dest1_m arg4_m
 %define dest2_m tmp4_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

%ifidn PS,8			;64-bit code
 default rel
 [bits 64]
%endif

section .text

%ifidn PS,8			;64-bit code
 %define xmask0f  xmm8
 %define xgft1_lo xmm7
 %define xgft1_hi xmm6
 %define xgft2_lo xmm5
 %define xgft2_hi xmm4

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
%else				;32-bit code
 ;; Only 8 xmm registers available: the second table pair reuses the
 ;; first pair's registers, so tables are reloaded inside the loop.
 %define xmask0f  xmm4
 %define xgft1_lo xmm7
 %define xgft1_hi xmm6
 %define xgft2_lo xgft1_lo
 %define xgft2_hi xgft1_hi

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
%endif

align 16
global gf_2vect_dot_prod_sse:function

func(gf_2vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			;Main loop consumes 16 bytes/iteration;
	SSTR	len_m, len		; len < 16 is an error
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of pointers
	SSTR	dest2_m, dest2
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16
	pxor	xp1, xp1		;Clear both accumulators
	pxor	xp2, xp2
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;Next source buffer

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8				;64-bit code
	;; Second-output tables live vec*32 bytes past the first-output
	;; tables; enough registers here to load both pairs up front.
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				;32-bit code
	;; 32-bit: xgft2_* alias xgft1_*, so the second table pair must be
	;; loaded only now, after the first product has been accumulated.
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}

	add	tmp, 32
	add	vec_i, PS
%endif
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	;; pos now ran past len-16; if it landed exactly at len the whole
	;; buffer was covered, otherwise redo a final overlapped block.
	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_2vect_dot_prod_sse, 00, 04, 0062
|
216
erasure_code/gf_2vect_dot_prod_sse_perf.c
Normal file
216
erasure_code/gf_2vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,216 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Print len bytes of buf in hex, 32 bytes per output row. */
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k-row by m-column matrix of bytes in hex, one row per line. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Performance harness for FUNCTION_UNDER_TEST (default
 * gf_2vect_dot_prod_sse): times repeated table init + 2-way dot product
 * over TEST_SOURCES buffers of TEST_LEN bytes, then verifies the last
 * result against the scalar gf_vect_dot_prod_base reference.
 */
int main(int argc, char *argv[])
{
	int i, j;
	void *buf;
	/* g1/g2: random GF coefficients per source; g_tbls holds the two
	 * 32-byte-per-source lookup-table sets back to back. */
	u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32];
	u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2];
	u8 *buffs[TEST_SOURCES];
	struct perf start, stop;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned for the SIMD kernels)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest2 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref2 = buf;

	dest_ptrs[0] = dest1;
	dest_ptrs[1] = dest2;

	// Performance test: fill sources with random data
	for (i = 0; i < TEST_SOURCES; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	memset(dest1, 0, TEST_LEN);
	memset(dest2, 0, TEST_LEN);
	memset(dest_ref1, 0, TEST_LEN);
	memset(dest_ref2, 0, TEST_LEN);

	for (i = 0; i < TEST_SOURCES; i++) {
		g1[i] = rand();
		g2[i] = rand();
	}

	// Expand coefficients into the two lookup-table sets
	for (j = 0; j < TEST_SOURCES; j++) {
		gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
		gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
	}

	// Scalar reference results for the later correctness check
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
			      dest_ref2);

#ifdef DO_REF_PERF
	// Optional: time the scalar baseline (fewer loops, it is slow)
	perf_start(&start);
	for (i = 0; i < TEST_LOOPS / 100; i++) {
		for (j = 0; j < TEST_SOURCES; j++) {
			gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
			gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
		}

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
				      buffs, dest_ref2);
	}
	perf_stop(&stop);
	printf("gf_2vect_dot_prod_base" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 2) * i);
#endif

	// Warm-up call before timing
	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

	// Timed loop: table init + dot product each iteration
	perf_start(&start);
	for (i = 0; i < TEST_LOOPS; i++) {
		for (j = 0; j < TEST_SOURCES; j++) {
			gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
			gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
		}

		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
	}
	perf_stop(&stop);
	printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
	/* TEST_SOURCES+2 buffers of TEST_LEN bytes touched per iteration */
	perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 2) * i);

	// Verify SIMD output matches the scalar reference
	if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
		printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref1, 25);
		printf("dprod_dut:");
		dump(dest1, 25);
		return -1;
	}
	if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
		printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref2, 25);
		printf("dprod_dut:");
		dump(dest2, 25);
		return -1;
	}

	printf("pass perf check\n");
	return 0;

}
|
477
erasure_code/gf_2vect_dot_prod_sse_test.c
Normal file
477
erasure_code/gf_2vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,477 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_2vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 10000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf, wrapping to a new line every 32 bytes. */
void dump(unsigned char *buf, int len)
{
	int n;

	for (n = 0; n < len; n++) {
		printf(" %2x", 0xff & buf[n]);
		if ((n + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Hex-dump the first m bytes of each of the k row pointers in s. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", s[r][c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Hex-dump a flat k-by-m byte array (row-major), one row per line. */
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", s[r * m + c] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Correctness harness for FUNCTION_UNDER_TEST (default
 * gf_2vect_dot_prod_sse).  Runs the SIMD 2-way dot product against the
 * scalar gf_vect_dot_prod_base reference across several phases:
 * all-zero data, random data, varied source counts, end-of-buffer
 * (Electric Fence style) placement, random pointer alignment with
 * write-over checks, and all length alignments.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs;
	void *buf;
	/* g1/g2: GF coefficients per source; g_tbls holds both 32-byte
	 * per-source lookup-table sets back to back. */
	u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g_tbls[2 * TEST_SOURCES * 32];
	u8 *dest1, *dest2, *dest_ref1, *dest_ref2, *dest_ptrs[2];
	u8 *buffs[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];	// unaligned views into buffs
	u8 *udest_ptrs[2];		// unaligned views into dest1/dest2

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest2 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref1 = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref2 = buf;

	dest_ptrs[0] = dest1;
	dest_ptrs[1] = dest2;

	// Test of all zeros
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest1, 0, TEST_LEN);
	memset(dest2, 0, TEST_LEN);
	memset(dest_ref1, 0, TEST_LEN);
	memset(dest_ref2, 0, TEST_LEN);
	memset(g1, 2, TEST_SOURCES);	// fixed non-trivial coefficients
	memset(g2, 1, TEST_SOURCES);

	for (i = 0; i < TEST_SOURCES; i++) {
		gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
		gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
	}

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
			      dest_ref2);

	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

	if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref1, 25);
		printf("dprod_dut:");
		dump(dest1, 25);
		return -1;
	}
	if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref2, 25);
		printf("dprod_dut:");
		dump(dest2, 25);
		return -1;
	}

	putchar('.');	// progress marker per passed phase/iteration

	// Rand data test

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < TEST_SOURCES; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
		}

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
				      buffs, dest_ref2);

		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);

		if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_dut:");
			dump(dest1, 25);
			return -1;
		}
		if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_dut:");
			dump(dest2, 25);
			return -1;
		}

		putchar('.');
	}

	// Rand data test with varied parameters (every source count)
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < srcs; i++) {
				g1[i] = rand();
				g2[i] = rand();
			}

			for (i = 0; i < srcs; i++) {
				gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
				gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
			}

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
					      dest_ref2);

			FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);

			if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test1 srcs=%d\n", srcs);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref1, 25);
				printf("dprod_dut:");
				dump(dest1, 25);
				return -1;
			}
			if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test2 srcs=%d\n", srcs);
				dump_matrix(buffs, 5, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref2, 25);
				printf("dprod_dut:");
				dump(dest2, 25);
				return -1;
			}

			putchar('.');
		}
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < TEST_SOURCES; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < TEST_SOURCES; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
		}

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
				      efence_buffs, dest_ref2);

		FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);

		if (0 != memcmp(dest_ref1, dest1, size)) {
			/* NOTE(review): message prints rtest, which is stale
			 * here (this loop is indexed by size) — presumably
			 * should report size; left as-is. */
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, align);
			printf("dprod_dut:");
			dump(dest1, align);
			return -1;
		}

		if (0 != memcmp(dest_ref2, dest2, size)) {
			/* NOTE(review): rtest is stale here as well. */
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, align);
			printf("dprod_dut:");
			dump(dest2, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
		udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest1, 0, TEST_LEN);	// zero pad to check write-over
		memset(dest2, 0, TEST_LEN);

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < srcs; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < srcs; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
		}

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
		gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);

		if (memcmp(dest_ref1, udest_ptrs[0], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_dut:");
			dump(udest_ptrs[0], 25);
			return -1;
		}
		if (memcmp(dest_ref2, udest_ptrs[1], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_dut:");
			dump(udest_ptrs[1], 25);
			return -1;
		}
		// Confirm that padding around dests is unchanged
		memset(dest_ref1, 0, PTR_ALIGN_CHK_B);	// Make reference zero buff
		offset = udest_ptrs[0] - dest1;

		if (memcmp(dest1, dest_ref1, offset)) {
			printf("Fail rand ualign pad1 start\n");
			return -1;
		}
		if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad1 end\n");
			return -1;
		}

		offset = udest_ptrs[1] - dest2;
		if (memcmp(dest2, dest_ref1, offset)) {
			printf("Fail rand ualign pad2 start\n");
			return -1;
		}
		if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad2 end\n");
			return -1;
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		srcs = TEST_SOURCES;

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < srcs; i++) {
			g1[i] = rand();
			g2[i] = rand();
		}

		for (i = 0; i < srcs; i++) {
			gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
			gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
		}

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
		gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);

		if (memcmp(dest_ref1, dest_ptrs[0], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref1, 25);
			printf("dprod_dut:");
			dump(dest_ptrs[0], 25);
			return -1;
		}
		if (memcmp(dest_ref2, dest_ptrs[1], size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref2, 25);
			printf("dprod_dut:");
			dump(dest_ptrs[1], 25);
			return -1;
		}
	}

	printf("Pass\n");
	return 0;

}
|
236
erasure_code/gf_2vect_mad_avx.asm
Normal file
236
erasure_code/gf_2vect_mad_avx.asm
Normal file
@ -0,0 +1,236 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*9 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
save_reg r12, 9*16 + 0*8
|
||||
save_reg r15, 9*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
mov r12, [rsp + 9*16 + 0*8]
|
||||
mov r15, [rsp + 9*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm14
|
||||
%define xgft1_lo xmm13
|
||||
%define xgft1_hi xmm12
|
||||
%define xgft2_lo xmm11
|
||||
%define xgft2_hi xmm10
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xd1 xmm6
|
||||
%define xd2 xmm7
|
||||
%define xtmpd1 xmm8
|
||||
%define xtmpd2 xmm9
|
||||
|
||||
|
||||
align 16
|
||||
global gf_2vect_mad_avx:function
|
||||
|
||||
func(gf_2vect_mad_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
|
||||
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
|
||||
|
||||
.loop16
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
.loop16_overlap:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
vmovdqa xd1, xtmpd1 ;Restore xd1
|
||||
vmovdqa xd2, xtmpd2 ;Restore xd2
|
||||
jmp .loop16_overlap ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_mad_avx, 02, 01, 0204
|
247
erasure_code/gf_2vect_mad_avx2.asm
Normal file
247
erasure_code/gf_2vect_mad_avx2.asm
Normal file
@ -0,0 +1,247 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*9 + 3*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
vmovdqa [rsp+16*0],xmm6
|
||||
vmovdqa [rsp+16*1],xmm7
|
||||
vmovdqa [rsp+16*2],xmm8
|
||||
vmovdqa [rsp+16*3],xmm9
|
||||
vmovdqa [rsp+16*4],xmm10
|
||||
vmovdqa [rsp+16*5],xmm11
|
||||
vmovdqa [rsp+16*6],xmm12
|
||||
vmovdqa [rsp+16*7],xmm13
|
||||
vmovdqa [rsp+16*8],xmm14
|
||||
save_reg r12, 9*16 + 0*8
|
||||
save_reg r15, 9*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp+16*0]
|
||||
vmovdqa xmm7, [rsp+16*1]
|
||||
vmovdqa xmm8, [rsp+16*2]
|
||||
vmovdqa xmm9, [rsp+16*3]
|
||||
vmovdqa xmm10, [rsp+16*4]
|
||||
vmovdqa xmm11, [rsp+16*5]
|
||||
vmovdqa xmm12, [rsp+16*6]
|
||||
vmovdqa xmm13, [rsp+16*7]
|
||||
vmovdqa xmm14, [rsp+16*8]
|
||||
mov r12, [rsp + 9*16 + 0*8]
|
||||
mov r15, [rsp + 9*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm14
|
||||
%define xmask0fx xmm14
|
||||
%define xgft1_lo ymm13
|
||||
%define xgft1_hi ymm12
|
||||
%define xgft2_lo ymm11
|
||||
%define xgft2_hi ymm10
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmph1 ymm2
|
||||
%define xtmpl1 ymm3
|
||||
%define xtmph2 ymm4
|
||||
%define xtmpl2 ymm5
|
||||
%define xd1 ymm6
|
||||
%define xd2 ymm7
|
||||
%define xtmpd1 ymm8
|
||||
%define xtmpd2 ymm9
|
||||
|
||||
align 16
|
||||
global gf_2vect_mad_avx2:function
|
||||
|
||||
func(gf_2vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
mov dest2, [dest1+PS] ; reuse mul_array
|
||||
mov dest1, [dest1]
|
||||
|
||||
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
|
||||
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
|
||||
|
||||
.loop32
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
.loop32_overlap:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-32
|
||||
vmovdqa xd1, xtmpd1 ;Restore xd1
|
||||
vmovdqa xd2, xtmpd2 ;Restore xd2
|
||||
jmp .loop32_overlap ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_mad_avx2, 04, 01, 0205
|
239
erasure_code/gf_2vect_mad_sse.asm
Normal file
239
erasure_code/gf_2vect_mad_sse.asm
Normal file
@ -0,0 +1,239 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*9 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
save_reg r12, 9*16 + 0*8
|
||||
save_reg r15, 9*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
mov r12, [rsp + 9*16 + 0*8]
|
||||
mov r15, [rsp + 9*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm14
|
||||
%define xgft1_lo xmm13
|
||||
%define xgft1_hi xmm12
|
||||
%define xgft2_lo xmm11
|
||||
%define xgft2_hi xmm10
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xd1 xmm6
|
||||
%define xd2 xmm7
|
||||
%define xtmpd1 xmm8
|
||||
%define xtmpd2 xmm9
|
||||
|
||||
|
||||
align 16
|
||||
global gf_2vect_mad_sse:function
|
||||
func(gf_2vect_mad_sse)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
|
||||
xor pos, pos
|
||||
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
movdqu xgft1_lo,[tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
|
||||
XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
|
||||
|
||||
.loop16:
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
.loop16_overlap:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
movdqa xtmph1, xgft1_hi ;Reload const array registers
|
||||
movdqa xtmpl1, xgft1_lo
|
||||
movdqa xtmph2, xgft2_hi ;Reload const array registers
|
||||
movdqa xtmpl2, xgft2_lo
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
pshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
pxor xd1, xtmph1
|
||||
|
||||
pshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
pxor xd2, xtmph2
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result
|
||||
XSTR [dest2+pos], xd2 ;Store result
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
movdqa xd1, xtmpd1 ;Restore xd1
|
||||
movdqa xd2, xtmpd2 ;Restore xd2
|
||||
jmp .loop16_overlap ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
|
||||
mask0f:
|
||||
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_2vect_mad_sse, 00, 01, 0203
|
377
erasure_code/gf_3vect_dot_prod_avx.asm
Normal file
377
erasure_code/gf_3vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,377 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_reg r12, 6*16 + 0*8
|
||||
save_reg r13, 6*16 + 1*8
|
||||
save_reg r14, 6*16 + 2*8
|
||||
save_reg r15, 6*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
mov r12, [rsp + 6*16 + 0*8]
|
||||
mov r13, [rsp + 6*16 + 1*8]
|
||||
mov r14, [rsp + 6*16 + 2*8]
|
||||
mov r15, [rsp + 6*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; var1
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define arg5 trans2
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp3_m var(0)
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(1)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*2 ;2 local variables
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*2 ;2 local variables
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
%define ptr arg5
|
||||
|
||||
%define vec_i tmp2
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp3_m
|
||||
%define dest3_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f xmm11
|
||||
%define xgft1_lo xmm10
|
||||
%define xgft1_hi xmm9
|
||||
%define xgft2_lo xmm8
|
||||
%define xgft2_hi xmm7
|
||||
%define xgft3_lo xmm6
|
||||
%define xgft3_hi xmm5
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%else
|
||||
%define xmask0f xmm7
|
||||
%define xgft1_lo xmm6
|
||||
%define xgft1_hi xmm5
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
%define xgft3_lo xgft1_lo
|
||||
%define xgft3_hi xgft1_hi
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%endif
|
||||
|
||||
align 16
global gf_3vect_dot_prod_avx:function
;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests)
;;; GF(2^8) dot product of `vec` source buffers into 3 destination buffers,
;;; processed 16 bytes per iteration with AVX nibble-table lookups.
;;; Returns 0 on success, 1 if len < 16.
func(gf_3vect_dot_prod_avx)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			;Bias len so the loop test covers a full 16B block
	SSTR	len_m, len
	jl	.return_fail		;len < 16: too short to process one block
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of 3 pointers; unpack it
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	vpxor	xp1, xp1		;Zero the three partial-product accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	mov	tmp, mul_array		;tmp walks the A (first dest) table set
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;ptr = next source buffer

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8				; 64-bit code: enough regs to load B and C tables up front
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				; 32-bit code: B/C tables reloaded here (regs are shared)
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
%endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				; 32-bit code
	sal	vec, 1			;Double vec so vec*(32/PS) reaches the C tables
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1			;Restore vec
	add	tmp, 32
	add	vec_i, PS
%endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original (unbiased) length
	cmp	pos, tmp
	je	.return_pass		;Length was an exact multiple of 16

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func            core, ver, snum
slversion gf_3vect_dot_prod_avx, 02, 05, 0192
|
397
erasure_code/gf_3vect_dot_prod_avx2.asm
Normal file
397
erasure_code/gf_3vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,397 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
vmovdqa [rsp + 0*16], xmm6
|
||||
vmovdqa [rsp + 1*16], xmm7
|
||||
vmovdqa [rsp + 2*16], xmm8
|
||||
vmovdqa [rsp + 3*16], xmm9
|
||||
vmovdqa [rsp + 4*16], xmm10
|
||||
vmovdqa [rsp + 5*16], xmm11
|
||||
save_reg r12, 6*16 + 0*8
|
||||
save_reg r13, 6*16 + 1*8
|
||||
save_reg r14, 6*16 + 2*8
|
||||
save_reg r15, 6*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
mov r12, [rsp + 6*16 + 0*8]
|
||||
mov r13, [rsp + 6*16 + 1*8]
|
||||
mov r14, [rsp + 6*16 + 2*8]
|
||||
mov r15, [rsp + 6*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; var1
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define arg5 trans2
|
||||
%define tmp edx
|
||||
%define tmp.w edx
|
||||
%define tmp.b dl
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp3_m var(0)
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(1)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*2 ;2 local variables
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*2 ;2 local variables
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
%define ptr arg5
|
||||
|
||||
%define vec_i tmp2
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp3_m
|
||||
%define dest3_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f ymm11
|
||||
%define xmask0fx xmm11
|
||||
%define xgft1_lo ymm10
|
||||
%define xgft1_hi ymm9
|
||||
%define xgft2_lo ymm8
|
||||
%define xgft2_hi ymm7
|
||||
%define xgft3_lo ymm6
|
||||
%define xgft3_hi ymm5
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%define xp3 ymm4
|
||||
%else
|
||||
%define xmask0f ymm7
|
||||
%define xmask0fx xmm7
|
||||
%define xgft1_lo ymm6
|
||||
%define xgft1_hi ymm5
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
%define xgft3_lo xgft1_lo
|
||||
%define xgft3_hi xgft1_hi
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%define xp3 ymm4
|
||||
|
||||
%endif
|
||||
|
||||
align 16
global gf_3vect_dot_prod_avx2:function
;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
;;; GF(2^8) dot product of `vec` source buffers into 3 destination buffers,
;;; processed 32 bytes per iteration with AVX2. The 16-byte nibble tables
;;; are broadcast to both ymm lanes via vperm2i128.
;;; Returns 0 on success, 1 if len < 32.
func(gf_3vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32			;Bias len so the loop test covers a full 32B block
	SSTR	len_m, len
	jl	.return_fail		;len < 32: too short to process one block
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of 3 pointers; unpack it
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop32:
	vpxor	xp1, xp1		;Zero the three partial-product accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;ptr = next source buffer

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
					; "     Ax{00}, Ax{10}, ..., Ax{f0}
	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
%ifidn PS,8				; 64-bit code: enough regs to load B and C tables up front
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
					; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo

	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
					; "     Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo

	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				; 32-bit code: B/C tables reloaded here (regs are shared)
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
					; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
%endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				; 32-bit code
	sal	vec, 1			;Double vec so vec*(32/PS) reaches the C tables
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
					; "     Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
	sar	vec, 1			;Restore vec
	add	tmp, 32
	add	vec_i, PS
%endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3

	SLDR	len, len_m
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		;tmp = original (unbiased) length
	cmp	pos, tmp
	je	.return_pass		;Length was an exact multiple of 32

	;; Tail len
	mov	pos, len		;Overlapped offset length-32 (this is a 32B/pass kernel)
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func            core, ver, snum
slversion gf_3vect_dot_prod_avx2, 04, 05, 0197
|
378
erasure_code/gf_3vect_dot_prod_sse.asm
Normal file
378
erasure_code/gf_3vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,378 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_reg r12, 6*16 + 0*8
|
||||
save_reg r13, 6*16 + 1*8
|
||||
save_reg r14, 6*16 + 2*8
|
||||
save_reg r15, 6*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp + 0*16]
|
||||
movdqa xmm7, [rsp + 1*16]
|
||||
movdqa xmm8, [rsp + 2*16]
|
||||
movdqa xmm9, [rsp + 3*16]
|
||||
movdqa xmm10, [rsp + 4*16]
|
||||
movdqa xmm11, [rsp + 5*16]
|
||||
mov r12, [rsp + 6*16 + 0*8]
|
||||
mov r13, [rsp + 6*16 + 1*8]
|
||||
mov r14, [rsp + 6*16 + 2*8]
|
||||
mov r15, [rsp + 6*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; var0
|
||||
;;; var1
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
%define var(x) [ebp - PS - PS*x]
|
||||
|
||||
%define trans ecx
|
||||
%define trans2 esi
|
||||
%define arg0 trans ;trans and trans2 are for the variables in stack
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 ebx
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 trans
|
||||
%define arg3_m arg(3)
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define arg5 trans2
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 trans2
|
||||
%define tmp3_m var(0)
|
||||
%define tmp4 trans2
|
||||
%define tmp4_m var(1)
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, PS*2 ;2 local variables
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg1, arg(1)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
add esp, PS*2 ;2 local variables
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest1 arg4
|
||||
%define ptr arg5
|
||||
|
||||
%define vec_i tmp2
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp4
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define len_m arg0_m
|
||||
%define src_m arg3_m
|
||||
%define dest1_m arg4_m
|
||||
%define dest2_m tmp3_m
|
||||
%define dest3_m tmp4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
|
||||
section .text
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
%define xmask0f xmm11
|
||||
%define xgft1_lo xmm2
|
||||
%define xgft1_hi xmm3
|
||||
%define xgft2_lo xmm4
|
||||
%define xgft2_hi xmm7
|
||||
%define xgft3_lo xmm6
|
||||
%define xgft3_hi xmm5
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm10
|
||||
%define xp2 xmm9
|
||||
%define xp3 xmm8
|
||||
%else
|
||||
%define xmask0f xmm7
|
||||
%define xgft1_lo xmm6
|
||||
%define xgft1_hi xmm5
|
||||
%define xgft2_lo xgft1_lo
|
||||
%define xgft2_hi xgft1_hi
|
||||
%define xgft3_lo xgft1_lo
|
||||
%define xgft3_hi xgft1_hi
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%endif
|
||||
|
||||
align 16
global gf_3vect_dot_prod_sse:function
;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests)
;;; GF(2^8) dot product of `vec` source buffers into 3 destination buffers,
;;; processed 16 bytes per iteration with SSE (pshufb) nibble-table lookups.
;;; Returns 0 on success, 1 if len < 16.
func(gf_3vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			;Bias len so the loop test covers a full 16B block
	SSTR	len_m, len
	jl	.return_fail		;len < 16: too short to process one block
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	;dests is an array of 3 pointers; unpack it
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	pxor	xp1, xp1		;Zero the three partial-product accumulators
	pxor	xp2, xp2
	pxor	xp3, xp3
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]	;ptr = next source buffer

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
%ifidn PS,8				;64-bit code: enough regs to load B and C tables up front
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	movdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	XLDR	x0, [ptr+pos]		;Get next source vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				;32-bit code: B/C tables reloaded here (regs are shared)
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
%endif
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				;32-bit code
	sal	vec, 1			;Double vec so vec*(32/PS) reaches the C tables
	movdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1			;Restore vec
	add	tmp, 32
	add	vec_i, PS
%endif
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp3, xgft3_hi		;xp3 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original (unbiased) length
	cmp	pos, tmp
	je	.return_pass		;Length was an exact multiple of 16

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func            core, ver, snum
slversion gf_3vect_dot_prod_sse, 00, 06, 0063
|
246
erasure_code/gf_3vect_dot_prod_sse_perf.c
Normal file
246
erasure_code/gf_3vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,246 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Print len bytes of buf as two-digit hex values, 32 bytes per row. */
void dump(unsigned char *buf, int len)
{
	int idx;

	for (idx = 0; idx < len; idx++) {
		printf(" %2x", 0xff & buf[idx]);
		/* Break the row after every 32nd byte (1-based count). */
		if ((idx + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m byte matrix s as two-digit hex values, one row per line. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");	/* end of matrix row */
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 100; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_3vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 3) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
583
erasure_code/gf_3vect_dot_prod_sse_test.c
Normal file
583
erasure_code/gf_3vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,583 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_3vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 10000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print a buffer as two-digit hex bytes, 32 bytes per output line.
void dump(unsigned char *buf, int len)
{
	int idx = 0;
	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix of bytes (array of k row pointers), one row per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat row-major k x m byte array, one row of m bytes per line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[(row * m) + col] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g_tbls[3 * TEST_SOURCES * 32], *dest_ptrs[3], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[3];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) "_test: %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");;
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero" xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");;
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
288
erasure_code/gf_3vect_mad_avx.asm
Normal file
288
erasure_code/gf_3vect_mad_avx.asm
Normal file
@ -0,0 +1,288 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
;;; AVX multiply-accumulate of one source vector into three GF(2^8) dest
;;; vectors, 16 bytes per iteration, using per-nibble pshufb table lookups.

%include "reg_sizes.asm"

%define PS 8			; pointer size in bytes

%ifidn __OUTPUT_FORMAT__, win64
 ;; win64 ABI: args 0-3 in rcx/rdx/r8/r9, args 4-5 fetched from the stack
 ;; into callee-saved r12/r15; xmm6-xmm15 must be preserved.
 %define arg0  rcx
 %define arg0.w ecx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define arg4  r12
 %define arg5  r15
 %define tmp   r11
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

%macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp+16*0],xmm6
	vmovdqa	[rsp+16*1],xmm7
	vmovdqa	[rsp+16*2],xmm8
	vmovdqa	[rsp+16*3],xmm9
	vmovdqa	[rsp+16*4],xmm10
	vmovdqa	[rsp+16*5],xmm11
	vmovdqa	[rsp+16*6],xmm12
	vmovdqa	[rsp+16*7],xmm13
	vmovdqa	[rsp+16*8],xmm14
	vmovdqa	[rsp+16*9],xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r15, 10*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp+16*0]
	vmovdqa	xmm7, [rsp+16*1]
	vmovdqa	xmm8, [rsp+16*2]
	vmovdqa	xmm9, [rsp+16*3]
	vmovdqa	xmm10, [rsp+16*4]
	vmovdqa	xmm11, [rsp+16*5]
	vmovdqa	xmm12, [rsp+16*6]
	vmovdqa	xmm13, [rsp+16*7]
	vmovdqa	xmm14, [rsp+16*8]
	vmovdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r15, [rsp + 10*16 + 1*8]
	add	rsp, stack_size
%endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ;; SysV ABI: all six args in registers; no xmm save/restore needed.
 %define arg0  rdi
 %define arg0.w edi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec   arg1
%define vec_i arg2
%define mul_array arg3
%define src   arg4
%define dest1 arg5
%define pos   return
%define pos.w return.w

;; mul_array and vec_i are consumed early, so their registers are reused
;; to hold the second and third dest pointers.
%define dest2 mul_array
%define dest3 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

%define xmask0f  xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9

%define x0      xmm0
%define xtmpa   xmm1
%define xtmph1  xmm2
%define xtmpl1  xmm3
%define xtmph2  xmm4
%define xtmpl2  xmm5
%define xtmph3  xmm6
%define xtmpl3  xmm7
%define xd1     xmm8
%define xd2     xtmpl1		; xtmpl1/xtmph1 are free after the dest1
%define xd3     xtmph1		; partial is folded in, so reuse them

align 16
global gf_3vect_mad_avx:function
func(gf_3vect_mad_avx)
	FUNC_SAVE
	sub	len, 16			; len now = last full-16B offset
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	sal	vec_i, 5		;Multiply by 32
	sal	vec, 5
	lea	tmp, [mul_array + vec_i]
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	vmovdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	vmovdqu	xgft3_hi, [tmp+2*vec+16]; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	XLDR	xd1, [dest1+pos]	;Get next dest vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xtmph1, xgft1_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl1, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl1	;GF add high and low partials
	vpxor	xd1, xd1, xtmph1	;xd1 += partial

	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest vector

	; dest2
	vpshufb	xtmph2, xgft2_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl2, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
	vpxor	xd2, xd2, xtmph2	;xd2 += partial

	; dest3
	vpshufb	xtmph3, xgft3_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl3, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xtmpl3	;GF add high and low partials
	vpxor	xd3, xd3, xtmph3	;xd3 += partial

	XSTR	[dest1+pos], xd1
	XSTR	[dest2+pos], xd2
	XSTR	[dest3+pos], xd3

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass		; length was an exact multiple of 16

.lessthan16:
	;; Tail len
	;; Do one more overlap pass
	;; Reads/writes the last 16 bytes; a byte mask built from the
	;; remaining length keeps already-finished bytes unchanged.
	mov	tmp, len		;Overlapped offset length-16
	XLDR	x0, [src+tmp]		;Get next source vector
	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest vector

	sub	len, pos

	; NOTE(review): non-VEX movdqa amid AVX code — presumably intentional;
	; confirm no SSE/AVX transition penalty concern here.
	movdqa	xtmph3, [constip16]	;Load const of i + 16
	vpinsrb	xtmpl3, xtmpl3, len.w, 15
	vpshufb	xtmpl3, xtmpl3, xmask0f	;Broadcast len to all bytes
	vpcmpgtb	xtmpl3, xtmpl3, xtmph3

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xgft1_hi, xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_hi, xgft1_lo	;GF add high and low partials
	vpand	xgft1_hi, xgft1_hi, xtmpl3	;mask off bytes beyond tail len
	vpxor	xd1, xd1, xgft1_hi

	; dest2
	vpshufb	xgft2_hi, xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_hi, xgft2_lo	;GF add high and low partials
	vpand	xgft2_hi, xgft2_hi, xtmpl3	;mask off bytes beyond tail len
	vpxor	xd2, xd2, xgft2_hi

	; dest3
	vpshufb	xgft3_hi, xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_hi, xgft3_lo	;GF add high and low partials
	vpand	xgft3_hi, xgft3_hi, xtmpl3	;mask off bytes beyond tail len
	vpxor	xd3, xd3, xgft3_hi

	XSTR	[dest1+tmp], xd1
	XSTR	[dest2+tmp], xd2
	XSTR	[dest3+tmp], xd3

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func        core, ver, snum
slversion gf_3vect_mad_avx, 02,  01,  0207
|
317
erasure_code/gf_3vect_mad_avx2.asm
Normal file
317
erasure_code/gf_3vect_mad_avx2.asm
Normal file
@ -0,0 +1,317 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
vmovdqa [rsp+16*0],xmm6
|
||||
vmovdqa [rsp+16*1],xmm7
|
||||
vmovdqa [rsp+16*2],xmm8
|
||||
vmovdqa [rsp+16*3],xmm9
|
||||
vmovdqa [rsp+16*4],xmm10
|
||||
vmovdqa [rsp+16*5],xmm11
|
||||
vmovdqa [rsp+16*6],xmm12
|
||||
vmovdqa [rsp+16*7],xmm13
|
||||
vmovdqa [rsp+16*8],xmm14
|
||||
vmovdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r15, 10*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp+16*0]
|
||||
vmovdqa xmm7, [rsp+16*1]
|
||||
vmovdqa xmm8, [rsp+16*2]
|
||||
vmovdqa xmm9, [rsp+16*3]
|
||||
vmovdqa xmm10, [rsp+16*4]
|
||||
vmovdqa xmm11, [rsp+16*5]
|
||||
vmovdqa xmm12, [rsp+16*6]
|
||||
vmovdqa xmm13, [rsp+16*7]
|
||||
vmovdqa xmm14, [rsp+16*8]
|
||||
vmovdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r15, [rsp + 10*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 mul_array
|
||||
%define dest3 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft1_hi ymm13
|
||||
%define xgft2_lo ymm12
|
||||
%define xgft3_lo ymm11
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmph1 ymm2
|
||||
%define xtmpl1 ymm3
|
||||
%define xtmph2 ymm4
|
||||
%define xtmpl2 ymm5
|
||||
%define xtmpl2x xmm5
|
||||
%define xtmph3 ymm6
|
||||
%define xtmpl3 ymm7
|
||||
%define xtmpl3x xmm7
|
||||
%define xd1 ymm8
|
||||
%define xd2 ymm9
|
||||
%define xd3 ymm10
|
||||
|
||||
align 16
|
||||
global gf_3vect_mad_avx2:function
|
||||
func(gf_3vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5
|
||||
lea tmp, [mul_array + vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
mov dest2, [dest1+PS] ; reuse mul_array
|
||||
mov dest3, [dest1+2*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop32:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;Get next dest vector
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xtmph3 ;xd3 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
XSTR [dest3+pos], xd3
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan32:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp.b, 0x1f
|
||||
vpinsrb xtmpl2x, xtmpl2x, tmp.w, 0
|
||||
vpbroadcastb xtmpl2, xtmpl2x ;Construct mask 0x1f1f1f...
|
||||
|
||||
mov tmp, len ;Overlapped offset length-32
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph3, [constip32] ;Load const of i + 32
|
||||
vpinsrb xtmpl3x, xtmpl3x, len.w, 15
|
||||
vinserti128 xtmpl3, xtmpl3, xtmpl3x, 1 ;swapped to xtmpl3x | xtmpl3x
|
||||
vpshufb xtmpl3, xtmpl3, xtmpl2 ;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
|
||||
vpcmpgtb xtmpl3, xtmpl3, xtmph3
|
||||
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
|
||||
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl3
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmpl3
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials
|
||||
vpand xtmph3, xtmph3, xtmpl3
|
||||
vpxor xd3, xd3, xtmph3 ;xd3 += partial
|
||||
|
||||
XSTR [dest1+tmp], xd1
|
||||
XSTR [dest2+tmp], xd2
|
||||
XSTR [dest3+tmp], xd3
|
||||
|
||||
.return_pass:
|
||||
mov return, 0
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
mov return, 1
|
||||
FUNC_RESTORE
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 32
|
||||
constip32:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_3vect_mad_avx2, 04, 01, 0208
|
298
erasure_code/gf_3vect_mad_sse.asm
Normal file
298
erasure_code/gf_3vect_mad_sse.asm
Normal file
@ -0,0 +1,298 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
;;; GF(2^8) multiply-accumulate of one source buffer into three destination
;;; buffers (SSSE3).  Same algorithm as the AVX2 variant but on 16-byte
;;; vectors: per-destination nibble lookup tables applied with PSHUFB, with
;;; one masked overlapped pass for the tail.

%include "reg_sizes.asm"

%define PS 8			; pointer size (bytes)
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12		; non-volatile, spilled in FUNC_SAVE
 %define arg5   r15		; non-volatile, spilled in FUNC_SAVE
 %define tmp    r11
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-xmm15 plus r12/r15
 %define arg(x)  [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Win64 prologue: spill non-volatile registers, load stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0],xmm6
	movdqa	[rsp+16*1],xmm7
	movdqa	[rsp+16*2],xmm8
	movdqa	[rsp+16*3],xmm9
	movdqa	[rsp+16*4],xmm10
	movdqa	[rsp+16*5],xmm11
	movdqa	[rsp+16*6],xmm12
	movdqa	[rsp+16*7],xmm13
	movdqa	[rsp+16*8],xmm14
	movdqa	[rsp+16*9],xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r15, 10*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r15, [rsp + 10*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define return rax
 %define return.w eax

 ; SysV: all six args arrive in registers, no prologue needed.
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec   arg1
%define vec_i arg2
%define mul_array arg3
%define src   arg4
%define dest1 arg5
%define pos   return
%define pos.w return.w

; mul_array / vec_i registers are reused as destination pointers once
; the gf tables have been loaded.
%define dest2 mul_array
%define dest3 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

%define xmask0f  xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xd1    xmm8
; xd2/xd3 alias the dest1 scratch registers: safe because xd1's partial
; is consumed before xd2/xd3 are loaded (see .loop16 ordering).
%define xd2    xtmpl1
%define xd3    xtmph1

align 16
global gf_3vect_mad_sse:function
func(gf_3vect_mad_sse)
	FUNC_SAVE
	sub	len, 16			; fail if len < 16 (one full vector)
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec_i, 5		;Multiply by 32
	sal	vec, 5
	lea	tmp, [mul_array + vec_i]

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xgft2_hi, [tmp+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xgft3_hi, [tmp+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	; PSHUFB destroys its destination, so the constant tables are
	; copied into scratch registers each iteration.
	movdqa	xtmph1, xgft1_hi	;Reload const array registers
	movdqa	xtmpl1, xgft1_lo
	movdqa	xtmph2, xgft2_hi	;Reload const array registers
	movdqa	xtmpl2, xgft2_lo
	movdqa	xtmph3, xgft3_hi	;Reload const array registers
	movdqa	xtmpl3, xgft3_lo

	XLDR	xd1, [dest1+pos]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pxor	xd1, xtmph1

	; xtmpl1/xtmph1 are dead now; their registers become xd2/xd3.
	XLDR	xd2, [dest2+pos]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+pos]	;reuse xtmph1. Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pxor	xd2, xtmph2

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pxor	xd3, xtmph3

	XSTR	[dest1+pos], xd1	;Store result
	XSTR	[dest2+pos], xd2	;Store result
	XSTR	[dest3+pos], xd3	;Store result

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

.lessthan16:
	;; Tail len
	;; Do one more overlap pass
	; Last partial vector is processed at offset len (= orig_len-16),
	; overlapping already-written bytes; a compare mask suppresses the
	; duplicate XOR on those bytes.
	mov	tmp, len		;Overlapped offset length-16

	XLDR	x0, [src+tmp]		;Get next source vector
	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;reuse xtmpl1. Get next dest vector
	XLDR	xd3, [dest3+tmp]	;reuse xtmph1. Get next dest vector

	sub	len, pos		; len = number of fresh tail bytes

	movdqa	xtmph3, [constip16]	;Load const of i + 16
	; Insert len at byte 15, then shuffle with 0x0f...0f (mask0f doubles
	; as an all-15 index vector) to broadcast it to every byte.
	pinsrb	xtmpl3, len.w, 15
	pshufb	xtmpl3, xmask0f		;Broadcast len to all bytes
	pcmpgtb	xtmpl3, xtmph3		; byte mask: 0xff where i < tail len

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; Tables are consumed in place here (no further iterations need them).
	; dest1
	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pand	xgft1_hi, xtmpl3	; zero partial for already-done bytes
	pxor	xd1, xgft1_hi

	; dest2
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pand	xgft2_hi, xtmpl3	; zero partial for already-done bytes
	pxor	xd2, xgft2_hi

	; dest3
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pand	xgft3_hi, xtmpl3	; zero partial for already-done bytes
	pxor	xd3, xgft3_hi

	XSTR	[dest1+tmp], xd1	;Store result
	XSTR	[dest2+tmp], xd2	;Store result
	XSTR	[dest3+tmp], xd3	;Store result

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16

mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
; Descending byte indices 15..0, compared against broadcast tail length
; to build the overlap mask.
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func        core, ver, snum
slversion gf_3vect_mad_sse, 00,  01,  0206
|
441
erasure_code/gf_4vect_dot_prod_avx.asm
Normal file
441
erasure_code/gf_4vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,441 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_4vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
;;; GF(2^8) dot product of `vec` source buffers into four destination
;;; buffers (AVX, 128-bit).  For each output byte position, each dest gets
;;; the XOR-sum over all sources of gf_mul(coef, src_byte), using per-source
;;; 32-byte nibble lookup tables from g_tbls applied with VPSHUFB.
;;; Supports elf64, win64 and elf32 (the 32-bit build keeps most "registers"
;;; in stack slots accessed via the SLDR/SSTR macros).

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 ; SLDR/SSTR are no-ops on 64-bit: all working values live in registers.
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved, loaded and restored
 %define arg5  r15		; must be saved and restored
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r14		; must be saved and restored
 %define tmp5  rdi		; must be saved and restored
 %define tmp6  rsi		; must be saved and restored
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size 9*16 + 7*8		; must be an odd multiple of 8
 %define arg(x) [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_reg	r12, 9*16 + 0*8
	save_reg	r13, 9*16 + 1*8
	save_reg	r14, 9*16 + 2*8
	save_reg	r15, 9*16 + 3*8
	save_reg	rdi, 9*16 + 4*8
	save_reg	rsi, 9*16 + 5*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	mov	r12, [rsp + 9*16 + 0*8]
	mov	r13, [rsp + 9*16 + 1*8]
	mov	r14, [rsp + 9*16 + 2*8]
	mov	r15, [rsp + 9*16 + 3*8]
	mov	rdi, [rsp + 9*16 + 4*8]
	mov	rsi, [rsp + 9*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	var2
;;;	var3
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 ; Only a handful of x86-32 registers exist, so several logical values
 ; share ecx/esi and are swapped through stack slots via SLDR/SSTR.
 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m	 arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m	 arg(2)
 %define arg3	 trans
 %define arg3_m	 arg(3)
 %define arg4	 trans
 %define arg4_m	 arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp3_m	 var(0)
 %define tmp4	 trans2
 %define tmp4_m	 var(1)
 %define tmp5	 trans2
 %define tmp5_m	 var(2)
 %define tmp6	 trans2
 %define tmp6_m	 var(3)
 %define return	 eax
 %macro SLDR 2			;stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*4	;4 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*4	;4 local variables
	pop	ebp
 %endmacro

%endif	; output formats

%define len    arg0
%define vec    arg1
%define mul_array arg2
%define src    arg3
%define dest1  arg4
%define ptr    arg5
%define vec_i  tmp2
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6
%define pos    return

%ifidn PS,4				;32-bit code
 %define len_m    arg0_m
 %define src_m    arg3_m
 %define dest1_m  arg4_m
 %define dest2_m  tmp3_m
 %define dest3_m  tmp4_m
 %define dest4_m  tmp5_m
 %define vskip3_m tmp6_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%ifidn PS,8			; 64-bit code
 default rel
 [bits 64]
%endif


section .text

%ifidn PS,8			;64-bit code
 %define xmask0f  xmm14
 %define xgft1_lo xmm13
 %define xgft1_hi xmm12
 %define xgft2_lo xmm11
 %define xgft2_hi xmm10
 %define xgft3_lo xmm9
 %define xgft3_hi xmm8
 %define xgft4_lo xmm7
 %define xgft4_hi xmm6

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
 %define xp3   xmm4
 %define xp4   xmm5
%else
 ; 32-bit: too few xmm regs to hold all tables at once; mask and table
 ; registers are shared and reloaded per use.
 %define xmm_trans xmm7		;reuse xmask0f and xgft1_lo
 %define xmask0f  xmm_trans
 %define xgft1_lo xmm_trans
 %define xgft1_hi xmm6
 %define xgft2_lo xgft1_lo
 %define xgft2_hi xgft1_hi
 %define xgft3_lo xgft1_lo
 %define xgft3_hi xgft1_hi
 %define xgft4_lo xgft1_lo
 %define xgft4_hi xgft1_hi

 %define x0    xmm0
 %define xtmpa xmm1
 %define xp1   xmm2
 %define xp2   xmm3
 %define xp3   xmm4
 %define xp4   xmm5
%endif
align 16
global gf_4vect_dot_prod_avx:function
func(gf_4vect_dot_prod_avx)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; fail if len < 16 (one full vector)
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip3, vec
	imul	vskip3, 96		; byte offset of the 4th table group (3*32*vec)
	SSTR	vskip3_m, vskip3
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest4, [dest1+3*PS]
	SSTR	dest4_m, dest4
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	vpxor	xp1, xp1		; clear the four accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]

%ifidn PS,8				;64-bit code
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	vmovdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}

	XLDR	x0, [ptr+pos]		;Get next source vector
	add	tmp, 32
	add	vec_i, PS

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
%else					;32-bit code
	XLDR	x0, [ptr+pos]		;Get next source vector
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
%endif

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

%ifidn PS,4				;32-bit code
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
%endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

%ifidn PS,4				;32-bit code
	; temporarily double vec so vec*(32/PS) addresses the 3rd table
	; group (64*vec_count bytes in), then restore it
	sal	vec, 1
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1
%endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

%ifidn PS,4				;32-bit code
	SLDR	vskip3, vskip3_m
	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	vmovdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}
	add	tmp, 32
	add	vec_i, PS
%endif
	vpshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	vpxor	xp4, xgft4_hi		;xp4 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	SLDR	dest4, dest4_m
	XSTR	[dest4+pos], xp4

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	; Redo the full dot product for the last (overlapping) 16 bytes;
	; recomputation is safe because the output is a pure function of
	; the sources at each offset.
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func        core, ver, snum
slversion gf_4vect_dot_prod_avx, 02,  05,  0193
|
460
erasure_code/gf_4vect_dot_prod_avx2.asm
Normal file
460
erasure_code/gf_4vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,460 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
;;; Platform/ABI selection: map portable names (arg0..arg5, tmp*, return) onto
;;; the registers of the active output format.  SLDR/SSTR are stack load/store
;;; helpers that are no-ops on 64-bit (enough registers) and real moves on
;;; 32-bit, where several logical variables share registers via stack slots.
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp.w r11d
 %define tmp.b r11b
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit: vars live in registers
 %endmacro
 %define SSTR SLDR
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12		; must be saved, loaded and restored
 %define arg5   r15		; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size 9*16 + 7*8	; must be an odd multiple of 8 (keeps rsp 16B-aligned)
 %define arg(x)     [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	; Win64 ABI: xmm6-xmm15 and the listed GPRs are callee-saved.
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	save_reg	r12,  9*16 + 0*8
	save_reg	r13,  9*16 + 1*8
	save_reg	r14,  9*16 + 2*8
	save_reg	r15,  9*16 + 3*8
	save_reg	rdi,  9*16 + 4*8
	save_reg	rsi,  9*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th argument is passed on the stack on win64
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6,  [rsp + 0*16]
	vmovdqa	xmm7,  [rsp + 1*16]
	vmovdqa	xmm8,  [rsp + 2*16]
	vmovdqa	xmm9,  [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	mov	r12, [rsp + 9*16 + 0*8]
	mov	r13, [rsp + 9*16 + 1*8]
	mov	r14, [rsp + 9*16 + 2*8]
	mov	r15, [rsp + 9*16 + 3*8]
	mov	rdi, [rsp + 9*16 + 4*8]
	mov	rsi, [rsp + 9*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	var2
;;;	var3
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m  arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m  arg(2)
 %define arg3	 trans
 %define arg3_m  arg(3)
 %define arg4	 trans
 %define arg4_m  arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp.w	 edx
 %define tmp.b	 dl
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp3_m  var(0)
 %define tmp4	 trans2
 %define tmp4_m  var(1)
 %define tmp5	 trans2
 %define tmp5_m  var(2)
 %define tmp6	 trans2
 %define tmp6_m  var(3)
 %define return  eax
 %macro SLDR 2			;stack load/restore: real moves on 32-bit
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*4	;4 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*4	;4 local variables
	pop	ebp
 %endmacro

%endif	; output formats

;;; Logical variable names for the dot-product kernel.
%define len    arg0		; byte length of each buffer
%define vec    arg1		; number of source vectors
%define mul_array arg2		; GF multiply lookup tables (g_tbls)
%define src    arg3		; array of source buffer pointers
%define dest1  arg4		; array of 4 destination buffer pointers
%define ptr    arg5		; current source pointer
%define vec_i  tmp2		; byte index into src pointer array
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6		; byte offset of the 4th table set (vec*96)
%define pos    return		; current byte offset within buffers

%ifidn PS,4			;32-bit code: stack homes for spilled vars
 %define len_m	  arg0_m
 %define src_m	  arg3_m
 %define dest1_m  arg4_m
 %define dest2_m  tmp3_m
 %define dest3_m  tmp4_m
 %define dest4_m  tmp5_m
 %define vskip3_m tmp6_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%ifidn PS,8			;64-bit code
 default rel
 [bits 64]
%endif


section .text

;; Register allocation.  64-bit has enough ymm registers to keep all four
;; table pairs live; 32-bit must reload tables per destination, so the
;; xgftN_* names all alias the same two registers.
%ifidn PS,8			;64-bit code
 %define xmask0f   ymm14
 %define xmask0fx  xmm14
 %define xgft1_lo  ymm13
 %define xgft1_hi  ymm12
 %define xgft2_lo  ymm11
 %define xgft2_hi  ymm10
 %define xgft3_lo  ymm9
 %define xgft3_hi  ymm8
 %define xgft4_lo  ymm7
 %define xgft4_hi  ymm6

 %define x0	ymm0
 %define xtmpa	ymm1
 %define xp1	ymm2
 %define xp2	ymm3
 %define xp3	ymm4
 %define xp4	ymm5
%else
 %define ymm_trans ymm7		;reuse xmask0f and xgft1_hi
 %define xmask0f   ymm_trans
 %define xmask0fx  xmm7
 %define xgft1_lo  ymm6
 %define xgft1_hi  ymm_trans
 %define xgft2_lo  xgft1_lo
 %define xgft2_hi  xgft1_hi
 %define xgft3_lo  xgft1_lo
 %define xgft3_hi  xgft1_hi
 %define xgft4_lo  xgft1_lo
 %define xgft4_hi  xgft1_hi

 %define x0	ymm0
 %define xtmpa	ymm1
 %define xp1	ymm2
 %define xp2	ymm3
 %define xp3	ymm4
 %define xp4	ymm5
%endif

;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
;;; Accumulates, for each of 4 destination buffers, the XOR ("GF add") of
;;; per-source partial products computed by vpshufb nibble-table lookups.
;;; Returns 0 on success, 1 if len < 32 (too short for one 32B pass).
align 16
global gf_4vect_dot_prod_avx2:function
func(gf_4vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32			; bias len so loop bound is len-32
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
	mov	vskip3, vec
	imul	vskip3, 96		; 3 table sets of 32 bytes each per source
	SSTR	vskip3_m, vskip3
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	; dest1 initially points at the dests array
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest4, [dest1+3*PS]
	SSTR	dest4_m, dest4
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop32:
	vpxor	xp1, xp1		; clear the 4 accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]
	XLDR	x0, [ptr+pos]		;Get next source vector

	add	vec_i, PS
 %ifidn PS,8				;64-bit code
	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo

	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
						; " Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
						; " Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
						; " Cx{00}, Cx{10}, ..., Cx{f0}
	vmovdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
						; " Dx{00}, Dx{10}, ..., Dx{f0}

	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
	add	tmp, 32
 %else					;32-bit code
	; mask register was clobbered (aliases table regs); rebuild it each pass
	mov	cl, 0x0f		;use ecx as a temp variable
	vpinsrb	xmask0fx, xmask0fx, ecx, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	vpand	xgft4_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vperm2i128 xtmpa, xgft4_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xgft4_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo

	vmovdqu	xgft1_lo, [tmp]			;Load array Ax{00}, Ax{01}, ..., Ax{0f}
						; " Ax{00}, Ax{10}, ..., Ax{f0}
	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
 %endif

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

 %ifidn PS,4				; 32-bit code
	vmovdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
						; " Bx{00}, Bx{10}, ..., Bx{f0}
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
 %endif
	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

 %ifidn PS,4				; 32-bit code
	sal	vec, 1			; temporarily double vec so 32/PS scaling hits the C tables
	vmovdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
						; " Cx{00}, Cx{10}, ..., Cx{f0}
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	sar	vec, 1
 %endif
	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial

 %ifidn PS,4				; 32-bit code
	SLDR	vskip3, vskip3_m
	vmovdqu	xgft4_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
					; " DX{00}, Dx{10}, ..., Dx{f0}
	vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
	add	tmp, 32
 %endif
	vpshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	vpxor	xp4, xgft4_hi		;xp4 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	SLDR	dest4, dest4_m
	XSTR	[dest4+pos], xp4

	SLDR	len, len_m
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		; len was biased by -32 above
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-32
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func            core, ver, snum
slversion gf_4vect_dot_prod_avx2, 04,  05,  0198
|
443
erasure_code/gf_4vect_dot_prod_sse.asm
Normal file
443
erasure_code/gf_4vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,443 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
;;; Platform/ABI selection: map portable names (arg0..arg5, tmp*, return) onto
;;; the registers of the active output format.  SLDR/SSTR are stack load/store
;;; helpers that are no-ops on 64-bit and real moves on 32-bit, where several
;;; logical variables share registers via stack slots.
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit: vars live in registers
 %endmacro
 %define SSTR SLDR
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12		; must be saved, loaded and restored
 %define arg5   r15		; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %macro SLDR 2			; no-op on 64-bit
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define LOG_PS 3
 %define stack_size 9*16 + 7*8	; must be an odd multiple of 8 (keeps rsp 16B-aligned)
 %define arg(x)     [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	; Win64 ABI: xmm6-xmm15 and the listed GPRs are callee-saved.
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_reg	r12,  9*16 + 0*8
	save_reg	r13,  9*16 + 1*8
	save_reg	r14,  9*16 + 2*8
	save_reg	r15,  9*16 + 3*8
	save_reg	rdi,  9*16 + 4*8
	save_reg	rsi,  9*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th argument is passed on the stack on win64
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6,  [rsp + 0*16]
	movdqa	xmm7,  [rsp + 1*16]
	movdqa	xmm8,  [rsp + 2*16]
	movdqa	xmm9,  [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	mov	r12, [rsp + 9*16 + 0*8]
	mov	r13, [rsp + 9*16 + 1*8]
	mov	r14, [rsp + 9*16 + 2*8]
	mov	r15, [rsp + 9*16 + 3*8]
	mov	rdi, [rsp + 9*16 + 4*8]
	mov	rsi, [rsp + 9*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	var0
;;;	var1
;;;	var2
;;;	var3
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define var(x) [ebp - PS - PS*x]

 %define trans	 ecx
 %define trans2  esi
 %define arg0	 trans		;trans and trans2 are for the variables in stack
 %define arg0_m  arg(0)
 %define arg1	 ebx
 %define arg2	 arg2_m
 %define arg2_m  arg(2)
 %define arg3	 trans
 %define arg3_m  arg(3)
 %define arg4	 trans
 %define arg4_m  arg(4)
 %define arg5	 trans2
 %define tmp	 edx
 %define tmp2	 edi
 %define tmp3	 trans2
 %define tmp3_m  var(0)
 %define tmp4	 trans2
 %define tmp4_m  var(1)
 %define tmp5	 trans2
 %define tmp5_m  var(2)
 %define tmp6	 trans2
 %define tmp6_m  var(3)
 %define return  eax
 %macro SLDR 2			;stack load/restore: real moves on 32-bit
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	sub	esp, PS*4	;4 local variables
	push	esi
	push	edi
	push	ebx
	mov	arg1, arg(1)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	add	esp, PS*4	;4 local variables
	pop	ebp
 %endmacro

%endif	; output formats

;;; Logical variable names for the dot-product kernel.
%define len    arg0		; byte length of each buffer
%define vec    arg1		; number of source vectors
%define mul_array arg2		; GF multiply lookup tables (g_tbls)
%define src    arg3		; array of source buffer pointers
%define dest1  arg4		; array of 4 destination buffer pointers
%define ptr    arg5		; current source pointer
%define vec_i  tmp2		; byte index into src pointer array
%define dest2  tmp3
%define dest3  tmp4
%define dest4  tmp5
%define vskip3 tmp6		; byte offset of the 4th table set (vec*96)
%define pos    return		; current byte offset within buffers

%ifidn PS,4			;32-bit code: stack homes for spilled vars
 %define len_m	  arg0_m
 %define src_m	  arg3_m
 %define dest1_m  arg4_m
 %define dest2_m  tmp3_m
 %define dest3_m  tmp4_m
 %define dest4_m  tmp5_m
 %define vskip3_m tmp6_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

%ifidn PS,8			; 64-bit code
 default rel
 [bits 64]
%endif


section .text

;; Register allocation.  64-bit keeps all four table pairs live; 32-bit must
;; reload tables per destination, so the xgftN_* names alias two registers.
%ifidn PS,8			;64-bit code
 %define xmask0f   xmm14
 %define xgft1_lo  xmm2
 %define xgft1_hi  xmm3
 %define xgft2_lo  xmm11
 %define xgft2_hi  xmm4
 %define xgft3_lo  xmm9
 %define xgft3_hi  xmm5
 %define xgft4_lo  xmm7
 %define xgft4_hi  xmm6

 %define x0	xmm0
 %define xtmpa	xmm1
 %define xp1	xmm8
 %define xp2	xmm10
 %define xp3	xmm12
 %define xp4	xmm13
%else
 %define xmm_trans xmm7		;reuse xmask0f and xgft1_lo
 %define xmask0f   xmm_trans
 %define xgft1_lo  xmm_trans
 %define xgft1_hi  xmm6
 %define xgft2_lo  xgft1_lo
 %define xgft2_hi  xgft1_hi
 %define xgft3_lo  xgft1_lo
 %define xgft3_hi  xgft1_hi
 %define xgft4_lo  xgft1_lo
 %define xgft4_hi  xgft1_hi

 %define x0	xmm0
 %define xtmpa	xmm1
 %define xp1	xmm2
 %define xp2	xmm3
 %define xp3	xmm4
 %define xp4	xmm5
%endif

;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests)
;;; Accumulates, for each of 4 destination buffers, the XOR ("GF add") of
;;; per-source partial products computed by pshufb nibble-table lookups.
;;; Returns 0 on success, 1 if len < 16 (too short for one 16B pass).
align 16
global gf_4vect_dot_prod_sse:function
func(gf_4vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; bias len so loop bound is len-16
	SSTR	len_m, len
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip3, vec
	imul	vskip3, 96		; 3 table sets of 32 bytes each per source
	SSTR	vskip3_m, vskip3
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	SLDR	dest1, dest1_m
	mov	dest2, [dest1+PS]	; dest1 initially points at the dests array
	SSTR	dest2_m, dest2
	mov	dest3, [dest1+2*PS]
	SSTR	dest3_m, dest3
	mov	dest4, [dest1+3*PS]
	SSTR	dest4_m, dest4
	mov	dest1, [dest1]
	SSTR	dest1_m, dest1

.loop16:
	pxor	xp1, xp1		; clear the 4 accumulators
	pxor	xp2, xp2
	pxor	xp3, xp3
	pxor	xp4, xp4
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:
	SLDR	src, src_m
	mov	ptr, [src+vec_i]

 %ifidn PS,8				;64-bit code
	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
	movdqu	xgft3_lo, [tmp+vec*(64/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(64/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}

	XLDR	x0, [ptr+pos]		;Get next source vector
	add	tmp, 32
	add	vec_i, PS

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0
 %else					;32-bit code
	XLDR	x0, [ptr+pos]		;Get next source vector
	; mask register was clobbered (aliases table regs); reload it each pass
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; " Ax{00}, Ax{10}, ..., Ax{f0}
 %endif

	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

 %ifidn PS,4				;32-bit code
	movdqu	xgft2_lo, [tmp+vec*(32/PS)]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vec*(32/PS)+16]	; " Bx{00}, Bx{10}, ..., Bx{f0}
 %endif
	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

 %ifidn PS,4				;32-bit code
	sal	vec, 1			; temporarily double vec so 32/PS scaling hits the C tables
	movdqu	xgft3_lo, [tmp+vec*(32/PS)]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vec*(32/PS)+16]	; " Cx{00}, Cx{10}, ..., Cx{f0}
	sar	vec, 1
 %endif
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp3, xgft3_hi		;xp3 += partial

 %ifidn PS,4				;32-bit code
	SLDR	vskip3, vskip3_m
	movdqu	xgft4_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft4_hi, [tmp+vskip3+16]	; " Dx{00}, Dx{10}, ..., Dx{f0}
	add	tmp, 32
	add	vec_i, PS
 %endif
	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	pxor	xp4, xgft4_hi		;xp4 += partial

	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest1, dest1_m
	SLDR	dest2, dest2_m
	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	SLDR	dest3, dest3_m
	XSTR	[dest3+pos], xp3
	SLDR	dest4, dest4_m
	XSTR	[dest4+pos], xp4

	SLDR	len, len_m
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		; len was biased by -16 above
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func           core, ver, snum
slversion gf_4vect_dot_prod_sse, 00,  06,  0064
|
281
erasure_code/gf_4vect_dot_prod_sse_perf.c
Normal file
281
erasure_code/gf_4vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,281 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
/* Name of the routine being benchmarked; overridable at compile time so the
 * same harness can drive other gf_4vect_dot_prod_* variants. */
#ifndef FUNCTION_UNDER_TEST
# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse
#endif

/* Stringize helpers: xstr() expands the macro before quoting it. */
#define str(s) #s
#define xstr(s) str(s)

/* Benchmark sizing.  Enable CACHED_TEST for a cache-resident workload,
 * define TEST_CUSTOM (with TEST_SOURCES/TEST_LEN) for a custom one; the
 * default is an uncached test pulled from a buffer larger than L3. */
//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_SOURCES 10
# define TEST_LEN     8*1024
# define TEST_LOOPS   40000
# define TEST_TYPE_STR "_warm"
#else
# ifndef TEST_CUSTOM
// Uncached test.  Pull from large mem base.
#  define TEST_SOURCES 10
#  define GT_L3_CACHE  32*1024*1024	/* some number > last level cache */
#  define TEST_LEN     ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))	/* 64B-aligned slice */
#  define TEST_LOOPS   100
#  define TEST_TYPE_STR "_cold"
# else
#  define TEST_TYPE_STR "_cus"
#  ifndef TEST_LOOPS
#   define TEST_LOOPS  1000
#  endif
# endif
#endif

/* Shorthand used throughout the erasure-code tests. */
typedef unsigned char u8;
|
||||
|
||||
/* Print len bytes of buf as space-separated hex, 32 bytes per row,
 * followed by a trailing newline. */
void dump(unsigned char *buf, int len)
{
	int idx;

	for (idx = 0; idx < len; idx++) {
		printf(" %2x", buf[idx] & 0xff);
		if ((idx + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k-row matrix of byte pointers, m hex bytes per row, one row per
 * line, followed by a trailing blank line. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
u8 *dest_ref4, *dest_ptrs[4];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 100; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_4vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 4) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 4) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
692
erasure_code/gf_4vect_dot_prod_sse_test.c
Normal file
692
erasure_code/gf_4vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,692 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_4vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 10000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf as two-digit hex values, 32 bytes per output row.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", 0xff & buf[idx]);
		if (++idx % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m matrix of bytes in hex, one matrix row per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat k*m byte array as a k x m hex matrix (row-major layout).
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g_tbls[4 * TEST_SOURCES * 32], *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest_ref1, *dest_ref2, *dest_ref3;
|
||||
u8 *dest_ref4, *dest_ptrs[4];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[4];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
memset(g4, 3, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
|
||||
dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test4 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref4, dest4, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
printf("size=%d offset=%d srcs=%d\n", size, offset, srcs);
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[3] - dest4;
|
||||
if (memcmp(dest4, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad4 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad4 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 32;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
336
erasure_code/gf_4vect_mad_avx.asm
Normal file
336
erasure_code/gf_4vect_mad_avx.asm
Normal file
@ -0,0 +1,336 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
;;; Multiply-and-add (MAD) of one source buffer into FOUR destination
;;; buffers over GF(2^8), 16 bytes per iteration, using the AVX 4-bit
;;; split-table pshufb technique: each source byte is split into its low
;;; and high nibbles, each nibble indexes a 16-byte lookup table, and the
;;; two partial products are XORed together, then XORed into the dest.
;;;
;;;   len       - byte count to process; returns 1 (fail) if len < 16
;;;   vec       - total number of source vectors (32-byte table stride)
;;;   vec_i     - index of this source vector within mul_array
;;;   mul_array - gf tables, 32 bytes (low tbl + high tbl) per coefficient
;;;   src       - source data pointer
;;;   dest      - array of 4 destination pointers
;;;
;;; Returns 0 in rax on success, 1 if len < 16.

%include "reg_sizes.asm"

%define PS 8				; pointer size; used to index the dest[] array

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; args 4/5 come from the stack on win64
 %define arg5   r15
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-15 + saved r12/r13/r15
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save all callee-saved registers this routine clobbers (win64 ABI:
 ; xmm6-xmm15 plus r12/r13/r15), then fetch stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r15, 10*16 + 2*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r15, [rsp + 10*16 + 2*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12			; r12 is callee-saved on SysV; pushed below
 %define return rax
 %define return.w eax

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r12
 %endmacro
%endif

;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

; mul_array and vec_i are fully consumed during setup, so their registers
; are recycled to hold destination pointers 2-4.
%define dest2 mul_array
%define dest3 tmp2
%define dest4 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

default rel

[bits 64]
section .text

; Registers held live across the whole loop.
%define xmask0f  xmm15
%define xgft3_hi xmm14
%define xgft4_hi xmm13
%define xgft4_lo xmm12

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph4 xmm8
%define xtmpl4 xmm9
%define xd1    xmm10
%define xd2    xmm11
; xd3/xd4 alias xtmph1/xtmpl1: those temporaries are dead once the dest1
; partial product has been folded in, so their registers are reused.
%define xd3 xtmph1
%define xd4 xtmpl1

align 16
global gf_4vect_mad_avx:function
func(gf_4vect_mad_avx)
	FUNC_SAVE
	sub	len, 16			; main loop handles whole 16B chunks
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	mov	tmp, vec

	sal	vec_i, 5		;Multiply by 32
	lea	tmp3, [mul_array + vec_i]	;tmp3 -> table column for this src

	sal	tmp, 6			;Multiply by 64
	vmovdqu	xgft3_hi, [tmp3+tmp+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	sal	vec, 5			;Multiply by 32
	add	tmp, vec		;tmp = vec*96: offset of the D tables
	vmovdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
	vmovdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}

	; Spread the 4 destination pointers into recycled registers;
	; dest1 is read last since arg5 itself is overwritten.
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]
	mov	dest4, [dest1+3*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	; A/B high+low and C low tables are reloaded each pass because there
	; are not enough xmm registers to keep all 8 tables resident.
	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl1, xtmpl1, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl1	;GF add high and low partials
	vpxor	xd1, xd1, xtmph1

	; Safe only now: xtmph1/xtmpl1 are dead after the dest1 update above.
	XLDR	xd3, [dest3+pos]	;Reuse xtmph1, Get next dest vector
	XLDR	xd4, [dest4+pos]	;Reuse xtmpl1, Get next dest vector

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl2, xtmpl2, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
	vpxor	xd2, xd2, xtmph2

	; dest3
	vpshufb	xtmph3, xgft3_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl3, xtmpl3, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xtmpl3	;GF add high and low partials
	vpxor	xd3, xd3, xtmph3

	; dest4
	vpshufb	xtmph4, xgft4_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl4, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph4, xtmph4, xtmpl4	;GF add high and low partials
	vpxor	xd4, xd4, xtmph4

	XSTR	[dest1+pos], xd1	;Store result
	XSTR	[dest2+pos], xd2	;Store result
	XSTR	[dest3+pos], xd3	;Store result
	XSTR	[dest4+pos], xd4	;Store result

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original length
	cmp	pos, tmp
	je	.return_pass		;len was a multiple of 16 - done

.lessthan16:
	;; Tail len
	;; Do one more overlap pass: reprocess the final 16 bytes at offset
	;; len-16 and mask the partial product so bytes already written by
	;; the main loop XOR with zero and are left unchanged.
	mov	tmp, len		;Overlapped offset length-16

	XLDR	x0, [src+tmp]		;Get next source vector

	vmovdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	vmovdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	vmovdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector
	XLDR	xtmph4, [dest3+tmp]	;Reuse xtmph4. Get next dest vector

	sub	len, pos		;len = -(bytes already done past offset)

	; Build the tail byte-mask in xtmph3: broadcast len (byte 15 index
	; 0x0f in xmask0f selects it into every lane) and compare against
	; descending constants so only not-yet-written bytes stay enabled.
	vmovdqa	xtmpl4, [constip16]	;Load const of i + 16
	vpinsrb	xtmph3, xtmph3, len.w, 15
	vpshufb	xtmph3, xtmph3, xmask0f	;Broadcast len to all bytes
	vpcmpgtb	xtmph3, xtmph3, xtmpl4

	XLDR	xtmpl4, [dest4+tmp]	;Get next dest vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; dest1
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl1, xtmpl1, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl1	;GF add high and low partials
	vpand	xtmph1, xtmph1, xtmph3	;Zero partials for already-done bytes
	vpxor	xd1, xd1, xtmph1

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl2, xtmpl2, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl2	;GF add high and low partials
	vpand	xtmph2, xtmph2, xtmph3	;Zero partials for already-done bytes
	vpxor	xd2, xd2, xtmph2

	; dest3 - the resident table registers can be clobbered now
	vpshufb	xgft3_hi, xgft3_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl3, xtmpl3, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_hi, xtmpl3	;GF add high and low partials
	vpand	xgft3_hi, xgft3_hi, xtmph3	;Zero partials for already-done bytes
	vpxor	xtmph4, xtmph4, xgft3_hi

	; dest4
	vpshufb	xgft4_hi, xgft4_hi, x0	;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft4_hi, xgft4_hi, xgft4_lo	;GF add high and low partials
	vpand	xgft4_hi, xgft4_hi, xtmph3	;Zero partials for already-done bytes
	vpxor	xtmpl4, xtmpl4, xgft4_hi

	XSTR	[dest1+tmp], xd1	;Store result
	XSTR	[dest2+tmp], xd2	;Store result
	XSTR	[dest3+tmp], xtmph4	;Store result
	XSTR	[dest4+tmp], xtmpl4	;Store result

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
; 0x0f in every byte: nibble mask, also used as a broadcast-byte-15 shuffle.
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
; Byte i holds (i - 16) as a signed byte; compared against the broadcast
; negative tail length to build the overlap-pass write mask.
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func          core, ver, snum
slversion gf_4vect_mad_avx, 02, 01, 020a
|
342
erasure_code/gf_4vect_mad_avx2.asm
Normal file
342
erasure_code/gf_4vect_mad_avx2.asm
Normal file
@ -0,0 +1,342 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
;;; AVX2 variant of the 4-destination GF(2^8) multiply-and-add:
;;; 32 bytes per iteration. Each ymm table register holds the complete
;;; 32-byte table (low-nibble table in the low lane, high-nibble table
;;; in the high lane) for one coefficient, so all 4 tables stay resident
;;; and vperm2i128 is used per-iteration to line the lanes up with the
;;; split source nibbles.
;;;
;;;   len       - byte count to process; returns 1 (fail) if len < 32
;;;   vec       - total number of source vectors (32-byte table stride)
;;;   vec_i     - index of this source vector within mul_array
;;;   mul_array - gf tables, 32 bytes per coefficient
;;;   src       - source data pointer
;;;   dest      - array of 4 destination pointers
;;;
;;; Returns 0 in rax on success, 1 if len < 32.

%include "reg_sizes.asm"

%define PS 8				; pointer size; used to index the dest[] array

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; args 4/5 come from the stack on win64
 %define arg5   r15
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-15 + saved r12/r15
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save callee-saved registers clobbered here (win64 ABI: xmm6-xmm15,
 ; r12, r15), then fetch stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r15, 10*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r15, [rsp + 10*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax

 ; SysV: no callee-saved registers are touched, so save/restore are no-ops.
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

; mul_array, vec and vec_i are fully consumed during setup, so their
; registers are recycled to hold destination pointers 2-4.
%define dest2 mul_array
%define dest3 vec
%define dest4 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

default rel

[bits 64]
section .text

%define xmask0f  ymm15
%define xmask0fx xmm15			; xmm view of the same register
%define xgft1_lo ymm14			; full 32B table (lo|hi) per coefficient
%define xgft2_lo ymm13
%define xgft3_lo ymm12
%define xgft4_lo ymm11

%define x0      ymm0
%define xtmpa   ymm1
%define xtmpl   ymm2
%define xtmplx  xmm2
%define xtmph1  ymm3
%define xtmph1x xmm3
%define xtmph2  ymm4
%define xtmph3  ymm5
%define xtmph4  ymm6
%define xd1     ymm7
%define xd2     ymm8
%define xd3     ymm9
%define xd4     ymm10

align 16
global gf_4vect_mad_avx2:function
func(gf_4vect_mad_avx2)
	FUNC_SAVE
	sub	len, 32			; main loop handles whole 32B chunks
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	sal	vec_i, 5		;Multiply by 32
	sal	vec, 5			;Multiply by 32
	lea	tmp, [mul_array + vec_i]	;tmp -> table column for this src

	; Load all four 32-byte tables once; they stay resident in ymm regs.
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
					; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
					; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	vmovdqu	xgft3_lo, [tmp+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	add	tmp, vec
	vmovdqu	xgft4_lo, [tmp+2*vec]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
					; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}

	; Spread the 4 destination pointers into recycled registers;
	; dest1 is read last since arg5 itself is overwritten.
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]	; reuse vec
	mov	dest4, [dest1+3*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop32:
	XLDR	x0, [src+pos]		;Get next source vector

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector
	XLDR	xd3, [dest3+pos]	;Get next dest vector
	XLDR	xd4, [dest4+pos]	;reuse xtmpl1. Get next dest vector

	vpand	xtmpl, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	; Re-pack lanes so each source half sits opposite its table half.
	vperm2i128 xtmpa, xtmpl, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xtmpl, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo

	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo

	; dest1
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xtmpl	;GF add high and low partials
	vpxor	xd1, xd1, xtmph1	;xd1 += partial

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xtmpl	;GF add high and low partials
	vpxor	xd2, xd2, xtmph2	;xd2 += partial

	; dest3
	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xtmpl	;GF add high and low partials
	vpxor	xd3, xd3, xtmph3	;xd3 += partial

	; dest4
	vpshufb	xtmph4, xtmph4, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph4, xtmph4, xtmpl	;GF add high and low partials
	vpxor	xd4, xd4, xtmph4	;xd4 += partial

	XSTR	[dest1+pos], xd1
	XSTR	[dest2+pos], xd2
	XSTR	[dest3+pos], xd3
	XSTR	[dest4+pos], xd4

	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		;tmp = original length
	cmp	pos, tmp
	je	.return_pass		;len was a multiple of 32 - done

.lessthan32:
	;; Tail len
	;; Do one more overlap pass: reprocess the final 32 bytes at offset
	;; len-32, masking the partial products so bytes already written by
	;; the main loop XOR with zero and are left unchanged.
	mov	tmp.b, 0x1f
	vpinsrb	xtmph1x, xtmph1x, tmp.w, 0
	vpbroadcastb xtmph1, xtmph1x	;Construct mask 0x1f1f1f...

	mov	tmp, len		;Overlapped offset length-32

	XLDR	x0, [src+tmp]		;Get next source vector

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector
	XLDR	xd3, [dest3+tmp]	;Get next dest vector
	XLDR	xd4, [dest4+tmp]	;Get next dest vector

	sub	len, pos		;len = -(bytes already done past offset)

	; Build the tail byte-mask in xtmpl: broadcast len across all 32
	; bytes (index 0x1f selects byte 15 in both lanes after the
	; vinserti128 duplication), then compare against descending
	; constants so only not-yet-written bytes stay enabled.
	vmovdqa	xtmph2, [constip32]	;Load const of i + 32
	vpinsrb	xtmplx, xtmplx, len.w, 15
	vinserti128 xtmpl, xtmpl, xtmplx, 1	;swapped to xtmplx | xtmplx
	vpshufb	xtmpl, xtmpl, xtmph1	;Broadcast len to all bytes. xtmph1=0x1f1f1f...
	vpcmpgtb	xtmpl, xtmpl, xtmph2

	vpand	xtmph1, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vperm2i128 xtmpa, xtmph1, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xtmph1, x0, 0x12		;swap x0 from 1hi|2hi to 1hi|2lo

	vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
	vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo

	; dest1 - table registers can be clobbered now; last use
	vpshufb	xtmph1, xtmph1, x0	;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xgft1_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph1, xtmph1, xgft1_lo	;GF add high and low partials
	vpand	xtmph1, xtmph1, xtmpl	;Zero partials for already-done bytes
	vpxor	xd1, xd1, xtmph1	;xd1 += partial

	; dest2
	vpshufb	xtmph2, xtmph2, x0	;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xgft2_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph2, xtmph2, xgft2_lo	;GF add high and low partials
	vpand	xtmph2, xtmph2, xtmpl	;Zero partials for already-done bytes
	vpxor	xd2, xd2, xtmph2	;xd2 += partial

	; dest3
	vpshufb	xtmph3, xtmph3, x0	;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xgft3_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph3, xtmph3, xgft3_lo	;GF add high and low partials
	vpand	xtmph3, xtmph3, xtmpl	;Zero partials for already-done bytes
	vpxor	xd3, xd3, xtmph3	;xd3 += partial

	; dest4
	vpshufb	xtmph4, xtmph4, x0	;Lookup mul table of high nibble
	vpshufb	xgft4_lo, xgft4_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph4, xtmph4, xgft4_lo	;GF add high and low partials
	vpand	xtmph4, xtmph4, xtmpl	;Zero partials for already-done bytes
	vpxor	xd4, xd4, xtmph4	;xd4 += partial

	XSTR	[dest1+tmp], xd1
	XSTR	[dest2+tmp], xd2
	XSTR	[dest3+tmp], xd3
	XSTR	[dest4+tmp], xd4

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data
align 32
; Byte i holds (i - 32) as a signed byte; compared against the broadcast
; negative tail length to build the overlap-pass write mask.
constip32:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
	ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef

;;; func           core, ver, snum
slversion gf_4vect_mad_avx2, 04, 01, 020b
|
342
erasure_code/gf_4vect_mad_sse.asm
Normal file
342
erasure_code/gf_4vect_mad_sse.asm
Normal file
@ -0,0 +1,342 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
;;; SSE (SSSE3 pshufb) variant of the 4-destination GF(2^8)
;;; multiply-and-add: 16 bytes per iteration. Uses two-operand
;;; (destructive) instruction forms, so tables kept in registers are
;;; copied to scratch registers before each pshufb and an unshifted
;;; copy of the source is kept for the low-nibble lookup.
;;;
;;;   len       - byte count to process; returns 1 (fail) if len < 16
;;;   vec       - total number of source vectors (32-byte table stride)
;;;   vec_i     - index of this source vector within mul_array
;;;   mul_array - gf tables, 32 bytes (low tbl + high tbl) per coefficient
;;;   src       - source data pointer
;;;   dest      - array of 4 destination pointers
;;;
;;; Returns 0 in rax on success, 1 if len < 16.

%include "reg_sizes.asm"

%define PS 8				; pointer size; used to index the dest[] array

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; args 4/5 come from the stack on win64
 %define arg5   r15
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13
 %define return rax
 %define return.w eax
 %define stack_size 16*10 + 3*8		; room for xmm6-15 + saved r12/r13/r15
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save callee-saved registers clobbered here (win64 ABI: xmm6-xmm15,
 ; r12, r13, r15), then fetch stack args 4 and 5.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r15, 10*16 + 2*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r15, [rsp + 10*16 + 2*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12			; r12 is callee-saved on SysV; pushed below
 %define return rax
 %define return.w eax

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r12
 %endmacro
%endif

;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

; mul_array and vec_i are fully consumed during setup, so their registers
; are recycled to hold destination pointers 2-4.
%define dest2 mul_array
%define dest3 tmp2
%define dest4 vec_i

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

; Registers held live across the whole loop.
%define xmask0f  xmm15
%define xgft3_hi xmm14
%define xgft4_hi xmm13
%define xgft4_lo xmm12

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph4 xmm8
%define xtmpl4 xmm9
%define xd1    xmm10
%define xd2    xmm11
; xd3/xd4 alias xtmph1/xtmpl1: those temporaries are dead once the dest1
; partial product has been folded in, so their registers are reused.
%define xd3 xtmph1
%define xd4 xtmpl1

align 16
global gf_4vect_mad_sse:function
func(gf_4vect_mad_sse)
	FUNC_SAVE
	sub	len, 16			; main loop handles whole 16B chunks
	jl	.return_fail
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	tmp, vec

	sal	vec_i, 5		;Multiply by 32
	lea	tmp3, [mul_array + vec_i]	;tmp3 -> table column for this src

	sal	tmp, 6			;Multiply by 64

	movdqu	xgft3_hi, [tmp3+tmp+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	sal	vec, 5			;Multiply by 32
	add	tmp, vec		;tmp = vec*96: offset of the D tables
	movdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...
	movdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}

	; Spread the 4 destination pointers into recycled registers;
	; dest1 is read last since arg5 itself is overwritten.
	mov	dest2, [dest1+PS]	; reuse mul_array
	mov	dest3, [dest1+2*PS]
	mov	dest4, [dest1+3*PS]	; reuse vec_i
	mov	dest1, [dest1]

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector
	; A/B high+low and C low tables are reloaded each pass because there
	; are not enough xmm registers to keep all 8 tables resident.
	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	; pshufb is destructive, so copy the resident tables to scratch.
	movdqa	xtmph3, xgft3_hi
	movdqa	xtmpl4, xgft4_lo
	movdqa	xtmph4, xgft4_hi

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pxor	xd1, xtmph1

	; Safe only now: xtmph1/xtmpl1 are dead after the dest1 update above.
	XLDR	xd3, [dest3+pos]	;Reuse xtmph1, Get next dest vector
	XLDR	xd4, [dest4+pos]	;Reuse xtmpl1, Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pxor	xd2, xtmph2

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pxor	xd3, xtmph3

	; dest4
	pshufb	xtmph4, x0		;Lookup mul table of high nibble
	pshufb	xtmpl4, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph4, xtmpl4		;GF add high and low partials
	pxor	xd4, xtmph4

	XSTR	[dest1+pos], xd1	;Store result
	XSTR	[dest2+pos], xd2	;Store result
	XSTR	[dest3+pos], xd3	;Store result
	XSTR	[dest4+pos], xd4	;Store result

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;tmp = original length
	cmp	pos, tmp
	je	.return_pass		;len was a multiple of 16 - done

.lessthan16:
	;; Tail len
	;; Do one more overlap pass: reprocess the final 16 bytes at offset
	;; len-16 and mask the partial product so bytes already written by
	;; the main loop XOR with zero and are left unchanged.
	mov	tmp, len		;Overlapped offset length-16

	XLDR	x0, [src+tmp]		;Get next source vector

	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector
	XLDR	xtmph4, [dest3+tmp]	;Reuse xtmph1. Get next dest vector

	sub	len, pos		;len = -(bytes already done past offset)

	; Build the tail byte-mask in xtmph3: broadcast len (index 0x0f in
	; xmask0f selects byte 15 into every lane) and compare against
	; descending constants so only not-yet-written bytes stay enabled.
	movdqa	xtmpl4, [constip16]	;Load const of i + 16
	pinsrb	xtmph3, len.w, 15
	pshufb	xtmph3, xmask0f		;Broadcast len to all bytes
	pcmpgtb	xtmph3, xtmpl4

	XLDR	xtmpl4, [dest4+tmp]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pand	xtmph1, xtmph3		;Zero partials for already-done bytes
	pxor	xd1, xtmph1

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pand	xtmph2, xtmph3		;Zero partials for already-done bytes
	pxor	xd2, xtmph2

	; dest3 - the resident table registers can be clobbered now
	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xtmpl3	;GF add high and low partials
	pand	xgft3_hi, xtmph3	;Zero partials for already-done bytes
	pxor	xtmph4, xgft3_hi

	; dest4
	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	pand	xgft4_hi, xtmph3	;Zero partials for already-done bytes
	pxor	xtmpl4, xgft4_hi

	XSTR	[dest1+tmp], xd1	;Store result
	XSTR	[dest2+tmp], xd2	;Store result
	XSTR	[dest3+tmp], xtmph4	;Store result
	XSTR	[dest4+tmp], xtmpl4	;Store result

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16

; 0x0f in every byte: nibble mask, also used as a broadcast-byte-15 shuffle.
mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
; Byte i holds (i - 16) as a signed byte; compared against the broadcast
; negative tail length to build the overlap-pass write mask.
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func          core, ver, snum
slversion gf_4vect_mad_sse, 00, 01, 0209
|
303
erasure_code/gf_5vect_dot_prod_avx.asm
Normal file
303
erasure_code/gf_5vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,303 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define tmp5 r14 ; must be saved and restored
|
||||
%define tmp6 r15 ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define tmp5 rdi ; must be saved and restored
|
||||
%define tmp6 rsi ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_xmm128 xmm12, 6*16
|
||||
save_xmm128 xmm13, 7*16
|
||||
save_xmm128 xmm14, 8*16
|
||||
save_xmm128 xmm15, 9*16
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
save_reg rsi, 10*16 + 5*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
vmovdqa xmm12, [rsp + 6*16]
|
||||
vmovdqa xmm13, [rsp + 7*16]
|
||||
vmovdqa xmm14, [rsp + 8*16]
|
||||
vmovdqa xmm15, [rsp + 9*16]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
mov rsi, [rsp + 10*16 + 5*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
%define ptr arg5
|
||||
%define vec_i tmp2
|
||||
%define dest1 tmp3
|
||||
%define dest2 tmp4
|
||||
%define vskip1 tmp5
|
||||
%define vskip3 tmp6
|
||||
%define pos return
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft1_lo xmm14
|
||||
%define xgft1_hi xmm13
|
||||
%define xgft2_lo xmm12
|
||||
%define xgft2_hi xmm11
|
||||
%define xgft3_lo xmm10
|
||||
%define xgft3_hi xmm9
|
||||
%define xgft4_lo xmm8
|
||||
%define xgft4_hi xmm7
|
||||
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm2
|
||||
%define xp2 xmm3
|
||||
%define xp3 xmm4
|
||||
%define xp4 xmm5
|
||||
%define xp5 xmm6
|
||||
|
||||
align 16
|
||||
global gf_5vect_dot_prod_avx:function
|
||||
func(gf_5vect_dot_prod_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov vskip1, vec
|
||||
imul vskip1, 32
|
||||
mov vskip3, vec
|
||||
imul vskip3, 96
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
mov dest1, [dest]
|
||||
mov dest2, [dest+PS]
|
||||
|
||||
|
||||
.loop16:
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
vpxor xp3, xp3
|
||||
vpxor xp4, xp4
|
||||
vpxor xp5, xp5
|
||||
|
||||
|
||||
.next_vect:
|
||||
mov ptr, [src+vec_i]
|
||||
add vec_i, PS
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
vmovdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
vmovdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
vmovdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
vmovdqu xgft1_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
add tmp, 32
|
||||
|
||||
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||
vpxor xp3, xgft3_hi ;xp3 += partial
|
||||
|
||||
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpxor xp4, xgft4_hi ;xp4 += partial
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp5, xgft1_hi ;xp5 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
mov tmp, [dest+2*PS]
|
||||
mov ptr, [dest+3*PS]
|
||||
mov vec_i, [dest+4*PS]
|
||||
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
XSTR [tmp+pos], xp3
|
||||
XSTR [ptr+pos], xp4
|
||||
XSTR [vec_i+pos], xp5
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop16 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_dot_prod_avx, 02, 04, 0194
|
315
erasure_code/gf_5vect_dot_prod_avx2.asm
Normal file
315
erasure_code/gf_5vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,315 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define tmp5 r14 ; must be saved and restored
|
||||
%define tmp6 r15 ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define tmp5 rdi ; must be saved and restored
|
||||
%define tmp6 rsi ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
vmovdqa [rsp + 0*16], xmm6
|
||||
vmovdqa [rsp + 1*16], xmm7
|
||||
vmovdqa [rsp + 2*16], xmm8
|
||||
vmovdqa [rsp + 3*16], xmm9
|
||||
vmovdqa [rsp + 4*16], xmm10
|
||||
vmovdqa [rsp + 5*16], xmm11
|
||||
vmovdqa [rsp + 6*16], xmm12
|
||||
vmovdqa [rsp + 7*16], xmm13
|
||||
vmovdqa [rsp + 8*16], xmm14
|
||||
vmovdqa [rsp + 9*16], xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
save_reg rsi, 10*16 + 5*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
vmovdqa xmm6, [rsp + 0*16]
|
||||
vmovdqa xmm7, [rsp + 1*16]
|
||||
vmovdqa xmm8, [rsp + 2*16]
|
||||
vmovdqa xmm9, [rsp + 3*16]
|
||||
vmovdqa xmm10, [rsp + 4*16]
|
||||
vmovdqa xmm11, [rsp + 5*16]
|
||||
vmovdqa xmm12, [rsp + 6*16]
|
||||
vmovdqa xmm13, [rsp + 7*16]
|
||||
vmovdqa xmm14, [rsp + 8*16]
|
||||
vmovdqa xmm15, [rsp + 9*16]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
mov rsi, [rsp + 10*16 + 5*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
%define ptr arg5
|
||||
%define vec_i tmp2
|
||||
%define dest1 tmp3
|
||||
%define dest2 tmp4
|
||||
%define vskip1 tmp5
|
||||
%define vskip3 tmp6
|
||||
%define pos return
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft1_hi ymm13
|
||||
%define xgft2_lo ymm12
|
||||
%define xgft2_hi ymm11
|
||||
%define xgft3_lo ymm10
|
||||
%define xgft3_hi ymm9
|
||||
%define xgft4_lo ymm8
|
||||
%define xgft4_hi ymm7
|
||||
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp1 ymm2
|
||||
%define xp2 ymm3
|
||||
%define xp3 ymm4
|
||||
%define xp4 ymm5
|
||||
%define xp5 ymm6
|
||||
|
||||
align 16
|
||||
global gf_5vect_dot_prod_avx2:function
|
||||
func(gf_5vect_dot_prod_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
mov vskip1, vec
|
||||
imul vskip1, 32
|
||||
mov vskip3, vec
|
||||
imul vskip3, 96
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
mov dest1, [dest]
|
||||
mov dest2, [dest+PS]
|
||||
|
||||
|
||||
.loop32:
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
vpxor xp1, xp1
|
||||
vpxor xp2, xp2
|
||||
vpxor xp3, xp3
|
||||
vpxor xp4, xp4
|
||||
vpxor xp5, xp5
|
||||
|
||||
|
||||
.next_vect:
|
||||
mov ptr, [src+vec_i]
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
add vec_i, PS
|
||||
|
||||
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
vpxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
vmovdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
add tmp, 32
|
||||
|
||||
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||
vpxor xp3, xgft3_hi ;xp3 += partial
|
||||
|
||||
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpxor xp4, xgft4_hi ;xp4 += partial
|
||||
|
||||
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
vpxor xp5, xgft1_hi ;xp5 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
mov tmp, [dest+2*PS]
|
||||
mov ptr, [dest+3*PS]
|
||||
mov vec_i, [dest+4*PS]
|
||||
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
XSTR [tmp+pos], xp3
|
||||
XSTR [ptr+pos], xp4
|
||||
XSTR [vec_i+pos], xp5
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop32 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_dot_prod_avx2, 04, 04, 0199
|
304
erasure_code/gf_5vect_dot_prod_sse.asm
Normal file
304
erasure_code/gf_5vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,304 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r12 ; must be saved and restored
|
||||
%define tmp5 r14 ; must be saved and restored
|
||||
%define tmp6 r15 ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved, loaded and restored
|
||||
%define arg5 r15 ; must be saved and restored
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13 ; must be saved and restored
|
||||
%define tmp4 r14 ; must be saved and restored
|
||||
%define tmp5 rdi ; must be saved and restored
|
||||
%define tmp6 rsi ; must be saved and restored
|
||||
%define return rax
|
||||
%define PS 8
|
||||
%define LOG_PS 3
|
||||
%define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
alloc_stack stack_size
|
||||
save_xmm128 xmm6, 0*16
|
||||
save_xmm128 xmm7, 1*16
|
||||
save_xmm128 xmm8, 2*16
|
||||
save_xmm128 xmm9, 3*16
|
||||
save_xmm128 xmm10, 4*16
|
||||
save_xmm128 xmm11, 5*16
|
||||
save_xmm128 xmm12, 6*16
|
||||
save_xmm128 xmm13, 7*16
|
||||
save_xmm128 xmm14, 8*16
|
||||
save_xmm128 xmm15, 9*16
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
save_reg rsi, 10*16 + 5*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp + 0*16]
|
||||
movdqa xmm7, [rsp + 1*16]
|
||||
movdqa xmm8, [rsp + 2*16]
|
||||
movdqa xmm9, [rsp + 3*16]
|
||||
movdqa xmm10, [rsp + 4*16]
|
||||
movdqa xmm11, [rsp + 5*16]
|
||||
movdqa xmm12, [rsp + 6*16]
|
||||
movdqa xmm13, [rsp + 7*16]
|
||||
movdqa xmm14, [rsp + 8*16]
|
||||
movdqa xmm15, [rsp + 9*16]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
mov rsi, [rsp + 10*16 + 5*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
%define ptr arg5
|
||||
%define vec_i tmp2
|
||||
%define dest1 tmp3
|
||||
%define dest2 tmp4
|
||||
%define vskip1 tmp5
|
||||
%define vskip3 tmp6
|
||||
%define pos return
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft1_lo xmm2
|
||||
%define xgft1_hi xmm3
|
||||
%define xgft2_lo xmm4
|
||||
%define xgft2_hi xmm5
|
||||
%define xgft3_lo xmm10
|
||||
%define xgft3_hi xmm6
|
||||
%define xgft4_lo xmm8
|
||||
%define xgft4_hi xmm7
|
||||
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp1 xmm9
|
||||
%define xp2 xmm11
|
||||
%define xp3 xmm12
|
||||
%define xp4 xmm13
|
||||
%define xp5 xmm14
|
||||
|
||||
align 16
|
||||
global gf_5vect_dot_prod_sse:function
|
||||
func(gf_5vect_dot_prod_sse)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov vskip1, vec
|
||||
imul vskip1, 32
|
||||
mov vskip3, vec
|
||||
imul vskip3, 96
|
||||
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
|
||||
mov dest1, [dest]
|
||||
mov dest2, [dest+PS]
|
||||
|
||||
|
||||
.loop16:
|
||||
mov tmp, mul_array
|
||||
xor vec_i, vec_i
|
||||
pxor xp1, xp1
|
||||
pxor xp2, xp2
|
||||
pxor xp3, xp3
|
||||
pxor xp4, xp4
|
||||
pxor xp5, xp5
|
||||
|
||||
|
||||
.next_vect:
|
||||
mov ptr, [src+vec_i]
|
||||
add vec_i, PS
|
||||
XLDR x0, [ptr+pos] ;Get next source vector
|
||||
|
||||
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
movdqu xgft2_lo, [tmp+vskip1*1] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
movdqu xgft2_hi, [tmp+vskip1*1+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
movdqu xgft3_lo, [tmp+vskip1*2] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
movdqu xgft3_hi, [tmp+vskip1*2+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
pxor xp1, xgft1_hi ;xp1 += partial
|
||||
|
||||
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
|
||||
pxor xp2, xgft2_hi ;xp2 += partial
|
||||
|
||||
movdqu xgft1_lo, [tmp+vskip1*4] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
movdqu xgft1_hi, [tmp+vskip1*4+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
add tmp, 32
|
||||
|
||||
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
|
||||
pxor xp3, xgft3_hi ;xp3 += partial
|
||||
|
||||
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
pxor xp4, xgft4_hi ;xp4 += partial
|
||||
|
||||
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
|
||||
pxor xp5, xgft1_hi ;xp5 += partial
|
||||
|
||||
cmp vec_i, vec
|
||||
jl .next_vect
|
||||
|
||||
mov tmp, [dest+2*PS]
|
||||
mov ptr, [dest+3*PS]
|
||||
mov vec_i, [dest+4*PS]
|
||||
|
||||
XSTR [dest1+pos], xp1
|
||||
XSTR [dest2+pos], xp2
|
||||
XSTR [tmp+pos], xp3
|
||||
XSTR [ptr+pos], xp4
|
||||
XSTR [vec_i+pos], xp5
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
;; Tail len
|
||||
mov pos, len ;Overlapped offset length-16
|
||||
jmp .loop16 ;Do one more overlap pass
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_dot_prod_sse, 00, 05, 0065
|
319
erasure_code/gf_5vect_dot_prod_sse_perf.c
Normal file
319
erasure_code/gf_5vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,319 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/*
 * Print the first len bytes of buf as two-digit hex values,
 * 32 bytes per output row, followed by a final newline.
 */
void dump(unsigned char *buf, int len)
{
	int n;

	for (n = 0; n < len; n++) {
		printf(" %2x", buf[n] & 0xff);
		if ((n + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Print a k-row by m-column byte matrix in hex, one matrix row
 * per output line, with a trailing blank line.
 */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls, *buffs[TEST_SOURCES];
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest_ref1, *dest_ref2;
|
||||
u8 *dest_ref3, *dest_ref4, *dest_ref5, *dest_ptrs[5];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 6 * TEST_SOURCES * 32)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 20; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_5vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 5) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 5) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
805
erasure_code/gf_5vect_dot_prod_sse_test.c
Normal file
805
erasure_code/gf_5vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,805 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_5vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 20000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/*
 * Hex-dump len bytes of buf to stdout, wrapping every 32 bytes,
 * and end with a newline.
 */
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", 0xff & buf[pos]);
		pos++;
		if (pos % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Hex-dump a matrix given as k row pointers of m bytes each,
 * one row per output line, ending with a blank line.
 */
void dump_matrix(unsigned char **s, int k, int m)
{
	int i, j;

	for (i = 0; i < k; i++) {
		unsigned char *row = s[i];

		for (j = 0; j < m; j++)
			printf(" %2x", row[j]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Hex-dump a flat k-by-m byte array stored row-major in s,
 * one row per output line, ending with a blank line.
 */
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *base = s + (row * m);

		for (col = 0; col < m; col++)
			printf(" %2x", base[col] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], *g_tbls;
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *buffs[TEST_SOURCES];
|
||||
u8 *dest_ref1, *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5;
|
||||
u8 *dest_ptrs[5];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[5];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
memset(g4, 9, TEST_SOURCES);
|
||||
memset(g5, 4, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
|
||||
dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test4 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test5 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref4, dest4, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref5, dest5, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, udest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[3] - dest4;
|
||||
if (memcmp(dest4, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad4 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad4 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[4] - dest5;
|
||||
if (memcmp(dest5, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad5 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad5 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, dest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
365
erasure_code/gf_5vect_mad_avx.asm
Normal file
365
erasure_code/gf_5vect_mad_avx.asm
Normal file
@ -0,0 +1,365 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define tmp4 r14
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 5*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define tmp4 r13
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp4
|
||||
%define dest3 mul_array
|
||||
%define dest4 tmp2
|
||||
%define dest5 vec_i
|
||||
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft5_hi xmm14
|
||||
%define xgft4_lo xmm13
|
||||
%define xgft4_hi xmm12
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xtmph3 xmm6
|
||||
%define xtmpl3 xmm7
|
||||
%define xtmph5 xmm8
|
||||
%define xtmpl5 xmm9
|
||||
%define xd1 xmm10
|
||||
%define xd2 xmm11
|
||||
%define xd3 xtmpl1
|
||||
%define xd4 xtmph1
|
||||
%define xd5 xtmpl2
|
||||
|
||||
|
||||
align 16
|
||||
global gf_5vect_mad_avx:function
|
||||
func(gf_5vect_mad_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov tmp, vec
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
lea tmp3, [mul_array + vec_i]
|
||||
sal tmp, 6 ;Multiply by 64
|
||||
vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
sal vec, 5 ;Multiply by 32
|
||||
add tmp, vec
|
||||
vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
|
||||
vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
|
||||
|
||||
mov dest3, [dest1+2*PS] ; reuse mul_array
|
||||
mov dest4, [dest1+3*PS]
|
||||
mov dest5, [dest1+4*PS] ; reuse vec_i
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop16:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1
|
||||
|
||||
XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector
|
||||
XLDR xd4, [dest4+pos] ;Reuse xtmph1, Get next dest vector
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2
|
||||
|
||||
XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph3
|
||||
|
||||
; dest4
|
||||
vpshufb xtmph2, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd4, xd4, xtmph2
|
||||
|
||||
; dest5
|
||||
vpshufb xtmph5, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph5, xtmph5, xtmpl5 ;GF add high and low partials
|
||||
vpxor xd5, xd5, xtmph5
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
XSTR [dest4+pos], xd4 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd5 ;Store result into dest5
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan16:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp, len ;Overlapped offset length-16
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph1, [constip16] ;Load const of i + 16
|
||||
vpinsrb xtmph5, len.w, 15
|
||||
vpshufb xtmph5, xmask0f ;Broadcast len to all bytes
|
||||
vpcmpgtb xtmph5, xtmph5, xtmph1
|
||||
|
||||
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmph5
|
||||
vpxor xd1, xd1, xtmph1
|
||||
|
||||
XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector
|
||||
XLDR xd4, [dest4+tmp] ;Reuse xtmph1, Get next dest vector
|
||||
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmph5
|
||||
vpxor xd2, xd2, xtmph2
|
||||
|
||||
XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector
|
||||
|
||||
; dest3
|
||||
vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpand xtmph3, xtmph3, xtmph5
|
||||
vpxor xd3, xd3, xtmph3
|
||||
|
||||
; dest4
|
||||
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpand xgft4_hi, xgft4_hi, xtmph5
|
||||
vpxor xd4, xd4, xgft4_hi
|
||||
|
||||
; dest5
|
||||
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft5_hi, xgft5_hi, xtmpl5 ;GF add high and low partials
|
||||
vpand xgft5_hi, xgft5_hi, xtmph5
|
||||
vpxor xd5, xd5, xgft5_hi
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
XSTR [dest4+tmp], xd4 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd5 ;Store result into dest5
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
constip16:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_mad_avx, 02, 01, 020d
|
363
erasure_code/gf_5vect_mad_avx2.asm
Normal file
363
erasure_code/gf_5vect_mad_avx2.asm
Normal file
@ -0,0 +1,363 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r15, 10*16 + 1*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r15, [rsp + 10*16 + 1*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp2
|
||||
%define dest3 mul_array
|
||||
%define dest4 vec
|
||||
%define dest5 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft2_lo ymm13
|
||||
%define xgft3_lo ymm12
|
||||
%define xgft4_lo ymm11
|
||||
%define xgft5_lo ymm10
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmpl ymm2
|
||||
%define xtmplx xmm2
|
||||
%define xtmph1 ymm3
|
||||
%define xtmph1x xmm3
|
||||
%define xtmph2 ymm4
|
||||
%define xd1 ymm5
|
||||
%define xd2 ymm6
|
||||
%define xd3 ymm7
|
||||
%define xd4 ymm8
|
||||
%define xd5 ymm9
|
||||
|
||||
align 16
|
||||
global gf_5vect_mad_avx2:function
|
||||
func(gf_5vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5 ;Multiply by 32
|
||||
lea tmp, [mul_array + vec_i]
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
add tmp, vec
|
||||
vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
|
||||
mov dest3, [dest1+2*PS] ; reuse mul_array
|
||||
mov dest4, [dest1+3*PS] ; reuse vec
|
||||
mov dest5, [dest1+4*PS] ; reuse vec_i
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop32:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;Get next dest vector
|
||||
XLDR xd4, [dest4+pos] ;Get next dest vector
|
||||
XLDR xd5, [dest5+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
; dest3
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph1 ;xd3 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
; dest4
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
|
||||
vpxor xd4, xd4, xtmph2 ;xd4 += partial
|
||||
|
||||
; dest5
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
|
||||
vpxor xd5, xd5, xtmph1 ;xd5 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1
|
||||
XSTR [dest2+pos], xd2
|
||||
XSTR [dest3+pos], xd3
|
||||
XSTR [dest4+pos], xd4
|
||||
XSTR [dest5+pos], xd5
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan32:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp.b, 0x1f
|
||||
vpinsrb xtmph1x, xtmph1x, tmp.w, 0
|
||||
vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f...
|
||||
|
||||
mov tmp, len ;Overlapped offset length-32
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;Get next dest vector
|
||||
XLDR xd4, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd5, [dest5+tmp] ;Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph2, [constip32] ;Load const of i + 32
|
||||
vpinsrb xtmplx, xtmplx, len.w, 15
|
||||
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
|
||||
vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
|
||||
vpcmpgtb xtmpl, xtmpl, xtmph2
|
||||
|
||||
vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
|
||||
; dest1
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl
|
||||
vpxor xd1, xd1, xtmph1 ;xd1 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
; dest2
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmpl
|
||||
vpxor xd2, xd2, xtmph2 ;xd2 += partial
|
||||
|
||||
vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
; dest3
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xgft3_lo ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl
|
||||
vpxor xd3, xd3, xtmph1 ;xd3 += partial
|
||||
|
||||
vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
; dest4
|
||||
vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xgft4_lo ;GF add high and low partials
|
||||
vpand xtmph2, xtmph2, xtmpl
|
||||
vpxor xd4, xd4, xtmph2 ;xd4 += partial
|
||||
|
||||
; dest5
|
||||
vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xgft5_lo ;GF add high and low partials
|
||||
vpand xtmph1, xtmph1, xtmpl
|
||||
vpxor xd5, xd5, xtmph1 ;xd5 += partial
|
||||
|
||||
XSTR [dest1+tmp], xd1
|
||||
XSTR [dest2+tmp], xd2
|
||||
XSTR [dest3+tmp], xd3
|
||||
XSTR [dest4+tmp], xd4
|
||||
XSTR [dest5+tmp], xd5
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
align 32
|
||||
constip32:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_5vect_mad_avx2, 04, 01, 020e
|
373
erasure_code/gf_5vect_mad_sse.asm
Normal file
373
erasure_code/gf_5vect_mad_sse.asm
Normal file
@ -0,0 +1,373 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8				; pointer size in bytes (x86-64)

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: first four integer args arrive in rcx/rdx/r8/r9;
 ;; args 5 and 6 are read from the caller's stack in FUNC_SAVE.
 ;; xmm6-xmm15 and r12-r15 are callee-saved, hence the spill area below.
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12			; loaded from stack slot arg(4)
 %define arg5   r15			; loaded from stack slot arg(5)
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13
 %define tmp4   r14
 %define return   rax
 %define return.w eax
 ;; 10 xmm save slots + 4 GPR save slots + 8 bytes pad to keep rsp 16-aligned
 %define stack_size 16*10 + 5*8
 ;; stack args live above our frame: saved regs + return address
 %define arg(x)     [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ;; Spill callee-saved state, then fetch the stack-passed args.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0], xmm6
	movdqa	[rsp+16*1], xmm7
	movdqa	[rsp+16*2], xmm8
	movdqa	[rsp+16*3], xmm9
	movdqa	[rsp+16*4], xmm10
	movdqa	[rsp+16*5], xmm11
	movdqa	[rsp+16*6], xmm12
	movdqa	[rsp+16*7], xmm13
	movdqa	[rsp+16*8], xmm14
	movdqa	[rsp+16*9], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r14, 10*16 + 2*8
	save_reg	r15, 10*16 + 3*8
	end_prolog
	mov	arg4, arg(4)		; 5th arg (src) comes in on the stack
	mov	arg5, arg(5)		; 6th arg (dest) comes in on the stack
 %endmacro

 ;; Restore callee-saved state in the reverse layout of FUNC_SAVE.
 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	movdqa	xmm9, [rsp+16*3]
	movdqa	xmm10, [rsp+16*4]
	movdqa	xmm11, [rsp+16*5]
	movdqa	xmm12, [rsp+16*6]
	movdqa	xmm13, [rsp+16*7]
	movdqa	xmm14, [rsp+16*8]
	movdqa	xmm15, [rsp+16*9]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r14, [rsp + 10*16 + 2*8]
	mov	r15, [rsp + 10*16 + 3*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: all six args arrive in registers; all xmm regs
 ;; are caller-saved, so only the r12/r13 scratch registers need saving.
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r12			; callee-saved; pushed in FUNC_SAVE
 %define tmp4   r13			; callee-saved; pushed in FUNC_SAVE
 %define return   rax
 %define return.w eax

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif
|
||||
|
||||
;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
;;; Argument aliases onto the ABI registers defined above.
%define len       arg0
%define len.w     arg0.w
%define vec       arg1
%define vec_i     arg2
%define mul_array arg3
%define src       arg4
%define dest1     arg5
%define pos       return
%define pos.w     return.w

;; dest2-dest5 reuse registers whose original values are dead once the
;; five destination pointers have been loaded from the dest array.
%define dest2     tmp4
%define dest3     mul_array		; mul_array no longer needed after setup
%define dest4     tmp2
%define dest5     vec_i			; vec_i no longer needed after setup

%ifndef EC_ALIGNED_ADDR
 ;;; Use unaligned load/store (default; buffers need no alignment)
 %define XLDR movdqu
 %define XSTR movdqu
%else
 ;;; Buffers are 16B-aligned: use non-temporal load/store unless disabled
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

;; xmm register assignments.  Note xd3/xd4/xd5 deliberately alias
;; temporaries (xtmpl1/xtmph1/xtmpl2) that are dead by the time the
;; corresponding dest vectors are loaded — there are not enough xmm
;; registers for distinct names.
%define xmask0f  xmm15
%define xgft5_hi xmm14
%define xgft4_lo xmm13
%define xgft4_hi xmm12

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph1 xmm2
%define xtmpl1 xmm3
%define xtmph2 xmm4
%define xtmpl2 xmm5
%define xtmph3 xmm6
%define xtmpl3 xmm7
%define xtmph5 xmm8
%define xtmpl5 xmm9
%define xd1    xmm10
%define xd2    xmm11
%define xd3    xtmpl1			; aliased: xtmpl1 is dead before xd3 loads
%define xd4    xtmph1			; aliased: xtmph1 is dead before xd4 loads
%define xd5    xtmpl2			; aliased: xtmpl2 is dead before xd5 loads
|
||||
|
||||
|
||||
align 16
global gf_5vect_mad_sse:function
;; GF(2^8) multiply-accumulate of one source vector into 5 dest vectors:
;;   dest[j] ^= gf_mul(g_tbls[j], src)  for j = 0..4
;; Each multiply is done per nibble with two pshufb table lookups.
;; Returns 0 on success, 1 if len < 16.
func(gf_5vect_mad_sse)
	FUNC_SAVE
	sub	len, 16			; fail if fewer than 16 bytes
	jl	.return_fail
	xor	pos, pos

	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	tmp, vec
	sal	vec_i, 5		;Multiply by 32 (bytes per gf table)
	lea	tmp3, [mul_array + vec_i] ;tmp3 -> tables for this source vector
	sal	tmp, 6			;Multiply by 64 (2*32)
	;; Table 5 (Ex) lives at offset 4*vec*32 = 2*tmp from tmp3
	movdqu	xgft5_hi, [tmp3+2*tmp+16]	; " Ex{00}, Ex{10}, ..., Ex{f0}
	sal	vec, 5			;Multiply by 32
	add	tmp, vec		;tmp = 3*vec*32: offset of table 4 (Dx)
	movdqu	xgft4_hi, [tmp3+tmp+16]	; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
	movdqu	xgft4_lo, [tmp3+tmp]	;Load array Dx{00}, Dx{01}, Dx{02}, ...

	;; Load the five destination pointers; dest3/dest5 reuse
	;; mul_array/vec_i, which are dead from here on.
	mov	dest3, [dest1+2*PS]	; reuse mul_array
	mov	dest4, [dest1+3*PS]
	mov	dest5, [dest1+4*PS]	; reuse vec_i
	mov	dest2, [dest1+PS]
	mov	dest1, [dest1]		; must be loaded last (overwrites base)

.loop16:
	XLDR	x0, [src+pos]		;Get next source vector

	;; Reload the A/B/C tables every iteration; only the D/E tables fit
	;; in registers across the loop.
	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
	movdqa	xtmph5, xgft5_hi	;Reload const array registers

	XLDR	xd1, [dest1+pos]	;Get next dest vector
	XLDR	xd2, [dest2+pos]	;Get next dest vector

	;; Split src bytes into low and high nibbles (pshufb indices).
	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pxor	xd1, xtmph1		;xd1 += partial

	;; xtmpl1/xtmph1 are now dead, freeing xd3/xd4 (aliases).
	XLDR	xd3, [dest3+pos]	;Reuse xtmpl1. Get next dest vector
	XLDR	xd4, [dest4+pos]	;Reuse xtmph1. Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pxor	xd2, xtmph2		;xd2 += partial

	XLDR	xd5, [dest5+pos]	;Reuse xtmpl2. Get next dest vector

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pxor	xd3, xtmph3		;xd3 += partial

	movdqa	xtmph2, xgft4_hi	;Reload const array registers
	movdqa	xtmpl3, xgft4_lo	;Reload const array registers

	; dest5 (done before dest4, whose lookups clobber xtmph2/xtmpl3)
	pshufb	xtmph5, x0		;Lookup mul table of high nibble
	pshufb	xtmpl5, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph5, xtmpl5		;GF add high and low partials
	pxor	xd5, xtmph5		;xd5 += partial

	; dest4
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl3		;GF add high and low partials
	pxor	xd4, xtmph2		;xd4 += partial

	XSTR	[dest1+pos], xd1	;Store result into dest1
	XSTR	[dest2+pos], xd2	;Store result into dest2
	XSTR	[dest3+pos], xd3	;Store result into dest3
	XSTR	[dest4+pos], xd4	;Store result into dest4
	XSTR	[dest5+pos], xd5	;Store result into dest5

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;original length; len was biased by -16
	cmp	pos, tmp
	je	.return_pass		;length was a multiple of 16 - done

.lessthan16:
	;; Tail len
	;; Do one more (partially overlapping) 16-byte pass at offset len,
	;; masking out the bytes that the main loop already processed.
	mov	tmp, len		;Overlapped offset length-16
	XLDR	x0, [src+tmp]		;Get next source vector

	sub	len, pos		;len - pos = -(overlap count), in [-15,-1]

	movdqa	xtmpl1, [constip16]	;Load const: byte i holds -(i+1)
	pinsrb	xtmph5, len.w, 15	;Insert (len - pos) into byte 15
	pshufb	xtmph5, xmask0f		;Broadcast len to all bytes (index 0x0f)
	;; Signed compare: byte i set iff (len-pos) > -(i+1), i.e. for the
	;; i >= pos-len positions the main loop has NOT yet written.
	pcmpgtb	xtmph5, xtmpl1

	movdqu	xtmph1, [tmp3+16]	; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
	movdqu	xtmpl1, [tmp3]		;Load array Ax{00}, Ax{01}, Ax{02}, ...
	movdqu	xtmph2, [tmp3+vec+16]	; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
	movdqu	xtmpl2, [tmp3+vec]	;Load array Bx{00}, Bx{01}, Bx{02}, ...
	movdqu	xtmph3, [tmp3+2*vec+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	movdqu	xtmpl3, [tmp3+2*vec]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xtmpl5, [tmp3+4*vec]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}

	XLDR	xd1, [dest1+tmp]	;Get next dest vector
	XLDR	xd2, [dest2+tmp]	;Get next dest vector

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	; dest1
	pshufb	xtmph1, x0		;Lookup mul table of high nibble
	pshufb	xtmpl1, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph1, xtmpl1		;GF add high and low partials
	pand	xtmph1, xtmph5		;Keep only the unprocessed tail bytes
	pxor	xd1, xtmph1

	XLDR	xd3, [dest3+tmp]	;Reuse xtmpl1. Get next dest vector
	XLDR	xd4, [dest4+tmp]	;Reuse xtmph1. Get next dest vector

	; dest2
	pshufb	xtmph2, x0		;Lookup mul table of high nibble
	pshufb	xtmpl2, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph2, xtmpl2		;GF add high and low partials
	pand	xtmph2, xtmph5		;Keep only the unprocessed tail bytes
	pxor	xd2, xtmph2

	XLDR	xd5, [dest5+tmp]	;Reuse xtmpl2. Get next dest vector

	; dest3
	pshufb	xtmph3, x0		;Lookup mul table of high nibble
	pshufb	xtmpl3, xtmpa		;Lookup mul table of low nibble
	pxor	xtmph3, xtmpl3		;GF add high and low partials
	pand	xtmph3, xtmph5		;Keep only the unprocessed tail bytes
	pxor	xd3, xtmph3

	; dest4 (the D tables are still live in xgft4_hi/lo; safe to clobber
	; here since this is the final pass)
	pshufb	xgft4_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft4_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft4_hi, xgft4_lo	;GF add high and low partials
	pand	xgft4_hi, xtmph5	;Keep only the unprocessed tail bytes
	pxor	xd4, xgft4_hi

	; dest5
	pshufb	xgft5_hi, x0		;Lookup mul table of high nibble
	pshufb	xtmpl5, xtmpa		;Lookup mul table of low nibble
	pxor	xgft5_hi, xtmpl5	;GF add high and low partials
	pand	xgft5_hi, xtmph5	;Keep only the unprocessed tail bytes
	pxor	xd5, xgft5_hi

	XSTR	[dest1+tmp], xd1	;Store result into dest1
	XSTR	[dest2+tmp], xd2	;Store result into dest2
	XSTR	[dest3+tmp], xd3	;Store result into dest3
	XSTR	[dest4+tmp], xd4	;Store result into dest4
	XSTR	[dest5+tmp], xd5	;Store result into dest5

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame
|
||||
|
||||
section .data

align 16

;; 0x0f in every byte: isolates a nibble for use as pshufb indices
mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
;; byte i = 0xff - i = -(i+1) signed; compared against the broadcast
;; (len - pos) to build the tail-overlap byte mask
constip16:
	ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff

;;; func            core, ver, snum
slversion gf_5vect_mad_sse, 00, 01, 020c
|
315
erasure_code/gf_6vect_dot_prod_avx.asm
Normal file
315
erasure_code/gf_6vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,315 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: all args in registers; xmm regs are caller-saved,
 ;; so only the callee-saved GPRs used as scratch need push/pop.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %define PS     8		; pointer size
 %define LOG_PS 3		; log2(PS), for scaling indices by shift

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: args 1-4 in rcx/rdx/r8/r9, arg 5 on the stack;
 ;; xmm6-xmm15, r12-r15, rdi, rsi are callee-saved.
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved, loaded and restored
 %define arg5  r15		; must be saved and restored
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r14		; must be saved and restored
 %define tmp5  rdi		; must be saved and restored
 %define tmp6  rsi		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size  10*16 + 7*8	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_xmm128	xmm15, 9*16
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r14, 10*16 + 2*8
	save_reg	r15, 10*16 + 3*8
	save_reg	rdi, 10*16 + 4*8
	save_reg	rsi, 10*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th arg (dests) comes in on the stack
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r14, [rsp + 10*16 + 2*8]
	mov	r15, [rsp + 10*16 + 3*8]
	mov	rdi, [rsp + 10*16 + 4*8]
	mov	rsi, [rsp + 10*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif
|
||||
|
||||
;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests)
;; Argument aliases onto the ABI registers defined above.
%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest1     tmp3
%define dest2     tmp4
%define vskip1    tmp5		; vec * 32:  stride between gf tables
%define vskip3    tmp6		; vec * 96:  3 * vskip1, table 4 offset
%define pos       return


%ifndef EC_ALIGNED_ADDR
 ;;; Use unaligned load/store (default; buffers need no alignment)
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
 ;;; Buffers are 16B-aligned: use non-temporal load/store unless disabled
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

;; xmm register assignments: six table pairs would not fit, so only
;; three lo/hi table pairs are held at a time (reloaded mid-iteration).
%define xmask0f  xmm15
%define xgft1_lo xmm14
%define xgft1_hi xmm13
%define xgft2_lo xmm12
%define xgft2_hi xmm11
%define xgft3_lo xmm10
%define xgft3_hi xmm9
%define x0       xmm0
%define xtmpa    xmm1
%define xp1      xmm2		; running GF sums for the six outputs
%define xp2      xmm3
%define xp3      xmm4
%define xp4      xmm5
%define xp5      xmm6
%define xp6      xmm7
|
||||
|
||||
align 16
global gf_6vect_dot_prod_avx:function
;; GF(2^8) dot product of `vec` source buffers into 6 dest buffers:
;;   dests[j] = XOR_i gf_mul(g_tbls[j][i], buffs[i])  for j = 0..5
;; Each multiply is a pair of vpshufb nibble-table lookups.
;; Returns 0 on success, 1 if len < 16.
func(gf_6vect_dot_prod_avx)
	FUNC_SAVE
	sub	len, 16			; fail if fewer than 16 bytes
	jl	.return_fail
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip1, vec
	imul	vskip1, 32		;vskip1 = vec*32: bytes per output's tables
	mov	vskip3, vec
	imul	vskip3, 96		;vskip3 = vec*96 = 3*vskip1
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	mov	dest1, [dest]		;cache first two dest pointers
	mov	dest2, [dest+PS]


.loop16:
	mov	tmp, mul_array		;tmp walks the per-source table slots
	xor	vec_i, vec_i
	vpxor	xp1, xp1		;clear the six accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	vpxor	xp5, xp5
	vpxor	xp6, xp6

.next_vect:
	mov	ptr, [src+vec_i]	;ptr -> next source buffer
	add	vec_i, PS
	XLDR	x0, [ptr+pos]		;Get next source vector

	;; Tables for outputs 1-3 of this source (lo = low-nibble products,
	;; hi = high-nibble products).
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	vmovdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	vmovdqu	xgft2_hi, [tmp+vskip1*1+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft3_hi, [tmp+vskip1*2+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5 (ptr is free here)

	;; Split src bytes into low and high nibbles (vpshufb indices).
	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0


	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial


	;; Reuse the same registers for the tables of outputs 4-6.
	vmovdqu	xgft1_lo, [tmp+vskip3]		;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	vmovdqu	xgft1_hi, [tmp+vskip3+16]	; "     Dx{00}, Dx{10}, ..., Dx{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
	vmovdqu	xgft2_hi, [tmp+vskip1*4+16]	; "     Ex{00}, Ex{10}, ..., Ex{f0}
	vmovdqu	xgft3_lo, [tmp+ptr]		;Load array Fx{00}, Fx{01}, ..., Fx{0f}
	vmovdqu	xgft3_hi, [tmp+ptr+16]		; "     Fx{00}, Fx{10}, ..., Fx{f0}
	add	tmp, 32			;advance to next source's tables


	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp4, xgft1_hi		;xp4 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp5, xgft2_hi		;xp5 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp6, xgft3_hi		;xp6 += partial

	cmp	vec_i, vec
	jl	.next_vect


	;; Fetch remaining dest pointers into scratch and store all six sums.
	mov	tmp, [dest+2*PS]
	mov	ptr, [dest+3*PS]
	mov	vec_i, [dest+4*PS]

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	XSTR	[tmp+pos], xp3
	mov	tmp, [dest+5*PS]
	XSTR	[ptr+pos], xp4
	XSTR	[vec_i+pos], xp5
	XSTR	[tmp+pos], xp6

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		;original length; len was biased by -16
	cmp	pos, tmp
	je	.return_pass		;length was a multiple of 16 - done

	;; Tail len: redo the last 16 bytes at an overlapping offset.
	;; Recomputing the overlap is harmless - results are idempotent
	;; because each pass fully rewrites the dest bytes it covers.
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame
|
||||
|
||||
section .data

align 16
;; 0x0f in every byte: isolates a nibble for use as vpshufb indices
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func                 core, ver, snum
slversion gf_6vect_dot_prod_avx, 02, 04, 0195
|
326
erasure_code/gf_6vect_dot_prod_avx2.asm
Normal file
326
erasure_code/gf_6vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,326 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: all args in registers; xmm/ymm regs are
 ;; caller-saved, so only the callee-saved GPR scratch needs push/pop.
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 %define tmp    r11
 %define tmp.w  r11d		; 32-bit and 8-bit views of tmp, used for
 %define tmp.b  r11b		; building the 0x0f byte mask with vpinsrb
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r12		; must be saved and restored
 %define tmp5   r14		; must be saved and restored
 %define tmp6   r15		; must be saved and restored
 %define return rax
 %define PS     8		; pointer size
 %define LOG_PS 3		; log2(PS), for scaling indices by shift

 %define func(x) x:
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 ;; Windows x64 ABI: args 1-4 in rcx/rdx/r8/r9, arg 5 on the stack;
 ;; xmm6-xmm15, r12-r15, rdi, rsi are callee-saved.
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved, loaded and restored
 %define arg5  r15		; must be saved and restored
 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size  10*16 + 7*8	; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	r12, 10*16 + 0*8
	save_reg	r13, 10*16 + 1*8
	save_reg	r14, 10*16 + 2*8
	save_reg	r15, 10*16 + 3*8
	save_reg	rdi, 10*16 + 4*8
	save_reg	rsi, 10*16 + 5*8
	end_prolog
	mov	arg4, arg(4)	; 5th arg (dests) comes in on the stack
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	mov	r12, [rsp + 10*16 + 0*8]
	mov	r13, [rsp + 10*16 + 1*8]
	mov	r14, [rsp + 10*16 + 2*8]
	mov	r15, [rsp + 10*16 + 3*8]
	mov	rdi, [rsp + 10*16 + 4*8]
	mov	rsi, [rsp + 10*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif
|
||||
|
||||
;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests)
;; Argument aliases onto the ABI registers defined above.
%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest1     tmp3
%define dest2     tmp4
%define vskip1    tmp5		; vec * 32:  stride between gf tables
%define vskip3    tmp6		; vec * 96:  3 * vskip1, table 4 offset
%define pos       return


%ifndef EC_ALIGNED_ADDR
 ;;; Use unaligned load/store (default; buffers need no alignment)
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
 ;;; Buffers are 32B-aligned: use non-temporal load/store unless disabled
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

;; ymm register assignments. xmask0fx is the xmm (low-lane) view of
;; xmask0f, used to seed the broadcast of the 0x0f byte.
%define xmask0f  ymm15
%define xmask0fx xmm15
%define xgft1_lo ymm14
%define xgft1_hi ymm13
%define xgft2_lo ymm12
%define xgft2_hi ymm11
%define xgft3_lo ymm10
%define xgft3_hi ymm9
%define x0       ymm0
%define xtmpa    ymm1
%define xp1      ymm2		; running GF sums for the six outputs
%define xp2      ymm3
%define xp3      ymm4
%define xp4      ymm5
%define xp5      ymm6
%define xp6      ymm7
|
||||
|
||||
align 16
global gf_6vect_dot_prod_avx2:function
;; GF(2^8) dot product of `vec` source buffers into 6 dest buffers,
;; AVX2 version processing 32 bytes per pass:
;;   dests[j] = XOR_i gf_mul(g_tbls[j][i], buffs[i])  for j = 0..5
;; The 16-byte lo/hi lookup tables are 128-bit, so the nibble lanes are
;; rearranged with vperm2i128 to make one vpshufb serve both lanes.
;; Returns 0 on success, 1 if len < 32.
func(gf_6vect_dot_prod_avx2)
	FUNC_SAVE
	sub	len, 32			; fail if fewer than 32 bytes
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...
	mov	vskip1, vec
	imul	vskip1, 32		;vskip1 = vec*32: bytes per output's tables
	mov	vskip3, vec
	imul	vskip3, 96		;vskip3 = vec*96 = 3*vskip1
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	mov	dest1, [dest]		;cache first two dest pointers
	mov	dest2, [dest+PS]


.loop32:
	mov	tmp, mul_array		;tmp walks the per-source table slots
	xor	vec_i, vec_i
	vpxor	xp1, xp1		;clear the six accumulators
	vpxor	xp2, xp2
	vpxor	xp3, xp3
	vpxor	xp4, xp4
	vpxor	xp5, xp5
	vpxor	xp6, xp6

.next_vect:
	mov	ptr, [src+vec_i]	;ptr -> next source buffer
	XLDR	x0, [ptr+pos]		;Get next source vector
	add	vec_i, PS

	;; Split into nibbles, then cross-swap 128-bit lanes so that the
	;; same 128-bit table (duplicated by vperm2i128 below) can be
	;; looked up for both lanes with a single vpshufb.
	vpand	xgft3_lo, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vperm2i128 xtmpa, xgft3_lo, x0, 0x30	;swap xtmpa from 1lo|2lo to 1lo|2hi
	vperm2i128 x0, xgft3_lo, x0, 0x12	;swap x0 from 1hi|2hi to 1hi|2lo

	;; Each 32B load brings in lo table (low lane) and hi table (high
	;; lane) together for outputs 1-3 of this source.
	vmovdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
					; "     Ax{00}, Ax{10}, ..., Ax{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
					; "     Bx{00}, Bx{10}, ..., Bx{f0}
	vmovdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
					; "     Cx{00}, Cx{10}, ..., Cx{f0}
	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5 (ptr is free here)

	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01	; swapped to hi | lo

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp1, xgft1_hi		;xp1 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp2, xgft2_hi		;xp2 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp3, xgft3_hi		;xp3 += partial


	;; Reuse the same registers for the tables of outputs 4-6.
	vmovdqu	xgft1_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
					; "     Dx{00}, Dx{10}, ..., Dx{f0}
	vmovdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
					; "     Ex{00}, Ex{10}, ..., Ex{f0}
	vmovdqu	xgft3_lo, [tmp+ptr]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
					; "     Fx{00}, Fx{10}, ..., Fx{f0}
	add	tmp, 32			;advance to next source's tables
	vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01	; swapped to hi | lo
	vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01	; swapped to hi | lo

	vpshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	vpxor	xp4, xgft1_hi		;xp4 += partial

	vpshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	vpxor	xp5, xgft2_hi		;xp5 += partial

	vpshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	vpshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	vpxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	vpxor	xp6, xgft3_hi		;xp6 += partial

	cmp	vec_i, vec
	jl	.next_vect


	;; Fetch remaining dest pointers into scratch and store all six sums.
	mov	tmp, [dest+2*PS]
	mov	ptr, [dest+3*PS]
	mov	vec_i, [dest+4*PS]

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	XSTR	[tmp+pos], xp3
	mov	tmp, [dest+5*PS]
	XSTR	[ptr+pos], xp4
	XSTR	[vec_i+pos], xp5
	XSTR	[tmp+pos], xp6

	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		;original length; len was biased by -32
	cmp	pos, tmp
	je	.return_pass		;length was a multiple of 32 - done

	;; Tail len: redo the last 32 bytes at an overlapping offset.
	;; Recomputing the overlap is harmless - results are idempotent
	;; because each pass fully rewrites the dest bytes it covers.
	mov	pos, len		;Overlapped offset length-32
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame
|
||||
|
||||
section .data
;; No table constants needed: the 0x0f mask is built in registers with
;; vpinsrb + vpbroadcastb at function entry.

;;; func                  core, ver, snum
slversion gf_6vect_dot_prod_avx2, 04, 04, 019a
|
315
erasure_code/gf_6vect_dot_prod_sse.asm
Normal file
315
erasure_code/gf_6vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,315 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
 ;; System V AMD64 ABI: first six integer args arrive in rdi, rsi, rdx, rcx, r8, r9
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9

 ;; Scratch and callee-saved temporaries used by the function body
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r13		; must be saved and restored
 %define tmp4  r12		; must be saved and restored
 %define tmp5  r14		; must be saved and restored
 %define tmp6  r15		; must be saved and restored
 %define return rax
 %define PS 8			; pointer size in bytes
 %define LOG_PS 3		; log2(PS), used to scale vec into a byte count

 %define func(x) x:
 ;; Preserve the callee-saved GPRs aliased as tmp3..tmp6 above
 %macro FUNC_SAVE 0
	push	r12
	push	r13
	push	r14
	push	r15
 %endmacro
 ;; Restore in reverse push order
 %macro FUNC_RESTORE 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
 %endmacro
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
 ;; Microsoft x64 ABI: first four integer args in rcx, rdx, r8, r9;
 ;; arg4 comes from the stack and is loaded into r12 by FUNC_SAVE.
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12 		; must be saved, loaded and restored
 %define arg5   r15 		; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13		; must be saved and restored
 %define tmp4   r14		; must be saved and restored
 %define tmp5   rdi		; must be saved and restored
 %define tmp6   rsi		; must be saved and restored
 %define return rax
 %define PS 8
 %define LOG_PS 3
 ;; 10 xmm saves (xmm6-xmm15) + 6 GPR saves + 1 pad slot.  An odd multiple
 ;; of 8 keeps rsp 16-byte aligned after the call pushed the return address,
 ;; so the save_xmm128 slots below are 16-byte aligned.
 %define stack_size  10*16 + 7*8		; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]

 %define func(x) proc_frame x
 ;; Win64 prologue: spill the nonvolatile xmm and GPR registers this
 ;; routine uses, then fetch the 5th argument (dests) from the stack.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm8, 2*16
	save_xmm128	xmm9, 3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_xmm128	xmm15, 9*16
	save_reg	r12,  10*16 + 0*8
	save_reg	r13,  10*16 + 1*8
	save_reg	r14,  10*16 + 2*8
	save_reg	r15,  10*16 + 3*8
	save_reg	rdi,  10*16 + 4*8
	save_reg	rsi,  10*16 + 5*8
	end_prolog
	mov	arg4, arg(4)
 %endmacro

 ;; Epilogue: restore everything spilled by FUNC_SAVE at the same offsets
 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm8, [rsp + 2*16]
	movdqa	xmm9, [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	movdqa	xmm15, [rsp + 9*16]
	mov	r12,  [rsp + 10*16 + 0*8]
	mov	r13,  [rsp + 10*16 + 1*8]
	mov	r14,  [rsp + 10*16 + 2*8]
	mov	r15,  [rsp + 10*16 + 3*8]
	mov	rdi,  [rsp + 10*16 + 4*8]
	mov	rsi,  [rsp + 10*16 + 5*8]
	add	rsp, stack_size
 %endmacro
%endif
|
||||
|
||||
;; Readable aliases for the function arguments and temporaries
%define len    arg0		; byte length to process
%define vec    arg1		; number of source vectors
%define mul_array arg2		; base of the GF multiply tables (g_tbls)
%define src    arg3		; array of source buffer pointers
%define dest   arg4		; array of 6 destination buffer pointers
%define ptr    arg5
%define vec_i  tmp2		; byte offset into src pointer array
%define dest1  tmp3		; first output pointer, cached
%define dest2  tmp4		; second output pointer, cached
%define vskip1 tmp5		; 32*vec  = size of one table block
%define vskip3 tmp6		; 96*vec  = offset of the 4th table block
%define pos    return		; current byte offset into the buffers

;; Select load/store flavor at build time
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

;; xmm register roles
%define xmask0f   xmm15	; 0x0f in every byte, for nibble extraction
%define xgft1_lo  xmm2	; low/high-nibble lookup tables, reused for D-F
%define xgft1_hi  xmm3
%define xgft2_lo  xmm4
%define xgft2_hi  xmm5
%define xgft3_lo  xmm6
%define xgft3_hi  xmm7
%define x0     xmm0	; current 16 source bytes
%define xtmpa  xmm1	; unshifted copy of x0 (low nibbles)
%define xp1    xmm8	; the six running GF dot-product accumulators
%define xp2    xmm9
%define xp3    xmm10
%define xp4    xmm11
%define xp5    xmm12
%define xp6    xmm13
||||
|
||||
align 16
global gf_6vect_dot_prod_sse:function
;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests)
;;; Computes six GF(2^8) dot products of `vec` source buffers, 16 bytes per
;;; pass, using pshufb nibble-table lookups.  Returns 0 on success, 1 when
;;; len < 16.
func(gf_6vect_dot_prod_sse)
	FUNC_SAVE
	sub	len, 16			; len becomes the last valid 16B offset
	jl	.return_fail		; reject len < 16
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	mov	vskip1, vec
	imul	vskip1, 32		; vskip1 = 32*vec (one table block)
	mov	vskip3, vec
	imul	vskip3, 96		; vskip3 = 96*vec (start of 4th block)
	sal	vec, LOG_PS		;vec *= PS. Make vec_i count by PS
	mov	dest1, [dest]		; cache the first two output pointers
	mov	dest2, [dest+PS]	; in callee-saved registers


.loop16:				; one iteration per 16-byte output slice
	mov	tmp, mul_array
	xor	vec_i, vec_i
	pxor	xp1, xp1		; clear the six accumulators
	pxor	xp2, xp2
	pxor	xp3, xp3
	pxor	xp4, xp4
	pxor	xp5, xp5
	pxor	xp6, xp6

.next_vect:				; fold one source vector into all six outputs
	mov	ptr, [src+vec_i]
	add	vec_i, PS
	XLDR	x0, [ptr+pos]		;Get next source vector

	movdqu	xgft1_lo, [tmp]		;Load array Ax{00}, Ax{01}, ..., Ax{0f}
	movdqu	xgft1_hi, [tmp+16]	; "     Ax{00}, Ax{10}, ..., Ax{f0}
	movdqu	xgft2_lo, [tmp+vskip1*1]	;Load array Bx{00}, Bx{01}, ..., Bx{0f}
	movdqu	xgft2_hi, [tmp+vskip1*1+16]	; "     Bx{00}, Bx{10}, ..., Bx{f0}
	movdqu	xgft3_lo, [tmp+vskip1*2]	;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft3_hi, [tmp+vskip1*2+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	lea	ptr, [vskip1 + vskip1*4]	;ptr = vskip5 (start of 6th block)

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	;; table lookup + GF add for outputs 1-3
	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp1, xgft1_hi		;xp1 += partial

	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp2, xgft2_hi		;xp2 += partial

	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp3, xgft3_hi		;xp3 += partial


	;; reuse the xgft registers for tables D-F (outputs 4-6)
	movdqu	xgft1_lo, [tmp+vskip3]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
	movdqu	xgft1_hi, [tmp+vskip3+16]	; "     Dx{00}, Dx{10}, ..., Dx{f0}
	movdqu	xgft2_lo, [tmp+vskip1*4]	;Load array Ex{00}, Ex{01}, ..., Ex{0f}
	movdqu	xgft2_hi, [tmp+vskip1*4+16]	; "     Ex{00}, Ex{10}, ..., Ex{f0}
	movdqu	xgft3_lo, [tmp+ptr]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
	movdqu	xgft3_hi, [tmp+ptr+16]	; "     Fx{00}, Fx{10}, ..., Fx{f0}
	add	tmp, 32			; advance to the next source's tables


	pshufb	xgft1_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft1_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft1_hi, xgft1_lo	;GF add high and low partials
	pxor	xp4, xgft1_hi		;xp4 += partial

	pshufb	xgft2_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft2_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft2_hi, xgft2_lo	;GF add high and low partials
	pxor	xp5, xgft2_hi		;xp5 += partial

	pshufb	xgft3_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft3_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft3_hi, xgft3_lo	;GF add high and low partials
	pxor	xp6, xgft3_hi		;xp6 += partial

	cmp	vec_i, vec
	jl	.next_vect


	;; fetch the remaining output pointers (scratch regs are free here)
	mov	tmp, [dest+2*PS]
	mov	ptr, [dest+3*PS]
	mov	vec_i, [dest+4*PS]

	XSTR	[dest1+pos], xp1
	XSTR	[dest2+pos], xp2
	XSTR	[tmp+pos], xp3
	mov	tmp, [dest+5*PS]
	XSTR	[ptr+pos], xp4
	XSTR	[vec_i+pos], xp5
	XSTR	[tmp+pos], xp6

	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp		; already stopped exactly at original length?
	je	.return_pass

	;; Tail len: redo the last slice aligned to the end of the buffers
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	FUNC_RESTORE
	mov	return, 0
	ret

.return_fail:
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16
mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func             core, ver, snum
slversion gf_6vect_dot_prod_sse, 00,  05,  0066
|
352
erasure_code/gf_6vect_dot_prod_sse_perf.c
Normal file
352
erasure_code/gf_6vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,352 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf to stdout, 32 bytes per line. */
void dump(unsigned char *buf, int len)
{
	int idx;

	for (idx = 1; idx <= len; idx++) {
		printf(" %2x", 0xff & buf[idx - 1]);
		if (0 == (idx % 32))
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m matrix of bytes, one buffer of s per output row. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls;
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1;
|
||||
u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6;
|
||||
u8 *dest_ptrs[6], *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 6 * TEST_SOURCES * 32)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest6 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref6 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
dest_ptrs[5] = dest6;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest6, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
memset(dest_ref6, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs,
|
||||
dest_ref6);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS / 20; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
|
||||
buffs, dest_ref6);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_6vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 6) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf_vect_mul_init(g1[j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(g2[j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g3[j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g4[j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g5[j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(g6[j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
}
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 6) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test6\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
911
erasure_code/gf_6vect_dot_prod_sse_test.c
Normal file
911
erasure_code/gf_6vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,911 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_6vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 20000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf to stdout, 32 bytes per line. */
void dump(unsigned char *buf, int len)
{
	int n = 0;

	while (n < len) {
		printf(" %2x", 0xff & buf[n]);
		n++;
		if (n % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m matrix of bytes, one buffer of s per output row. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Print a k x m matrix stored row-major in the flat byte array s. */
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *line = s + (row * m);

		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & line[col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs;
|
||||
void *buf;
|
||||
u8 g1[TEST_SOURCES], g2[TEST_SOURCES], g3[TEST_SOURCES];
|
||||
u8 g4[TEST_SOURCES], g5[TEST_SOURCES], g6[TEST_SOURCES], *g_tbls;
|
||||
u8 *dest1, *dest2, *dest3, *dest4, *dest5, *dest6, *dest_ref1;
|
||||
u8 *dest_ref2, *dest_ref3, *dest_ref4, *dest_ref5, *dest_ref6;
|
||||
u8 *dest_ptrs[6], *buffs[TEST_SOURCES];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptrs[6];
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, 2 * (6 * TEST_SOURCES * 32))) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest6 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref1 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref2 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref3 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref4 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref5 = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref6 = buf;
|
||||
|
||||
dest_ptrs[0] = dest1;
|
||||
dest_ptrs[1] = dest2;
|
||||
dest_ptrs[2] = dest3;
|
||||
dest_ptrs[3] = dest4;
|
||||
dest_ptrs[4] = dest5;
|
||||
dest_ptrs[5] = dest6;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest1, 0, TEST_LEN);
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest6, 0, TEST_LEN);
|
||||
memset(dest_ref1, 0, TEST_LEN);
|
||||
memset(dest_ref2, 0, TEST_LEN);
|
||||
memset(dest_ref3, 0, TEST_LEN);
|
||||
memset(dest_ref4, 0, TEST_LEN);
|
||||
memset(dest_ref5, 0, TEST_LEN);
|
||||
memset(dest_ref6, 0, TEST_LEN);
|
||||
memset(g1, 2, TEST_SOURCES);
|
||||
memset(g2, 1, TEST_SOURCES);
|
||||
memset(g3, 7, TEST_SOURCES);
|
||||
memset(g4, 9, TEST_SOURCES);
|
||||
memset(g5, 4, TEST_SOURCES);
|
||||
memset(g6, 0xe6, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[32 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[64 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[96 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[128 * TEST_SOURCES + i * 32]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[160 * TEST_SOURCES + i * 32]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES], buffs,
|
||||
dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES], buffs,
|
||||
dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test2\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test3\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test4\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test5\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test6\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
|
||||
buffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[32 * srcs], buffs,
|
||||
dest_ref2);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[64 * srcs], buffs,
|
||||
dest_ref3);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[96 * srcs], buffs,
|
||||
dest_ref4);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[128 * srcs], buffs,
|
||||
dest_ref5);
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[160 * srcs], buffs,
|
||||
dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test1 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref2, dest2, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test2 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref3, dest3, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test3 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref4, dest4, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test4 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref5, dest5, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test5 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, 25);
|
||||
return -1;
|
||||
}
|
||||
if (0 != memcmp(dest_ref6, dest6, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
|
||||
" test6 srcs=%d\n", srcs);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * TEST_SOURCES) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * TEST_SOURCES) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[32 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[64 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[96 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[128 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[160 * TEST_SOURCES],
|
||||
efence_buffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest_ptrs);
|
||||
|
||||
if (0 != memcmp(dest_ref1, dest1, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test1 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest1, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref2, dest2, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test2 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest2, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref3, dest3, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test3 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest3, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref4, dest4, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test4 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest4, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref5, dest5, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test5 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest5, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 != memcmp(dest_ref6, dest6, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test6 %d\n", rtest);
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, align);
|
||||
printf("dprod_dut:");
|
||||
dump(dest6, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptrs[0] = dest1 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[1] = dest2 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[2] = dest3 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[3] = dest4 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[4] = dest5 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
udest_ptrs[5] = dest6 + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest1, 0, TEST_LEN); // zero pad to check write-over
|
||||
memset(dest2, 0, TEST_LEN);
|
||||
memset(dest3, 0, TEST_LEN);
|
||||
memset(dest4, 0, TEST_LEN);
|
||||
memset(dest5, 0, TEST_LEN);
|
||||
memset(dest6, 0, TEST_LEN);
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], ubuffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], ubuffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], ubuffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], ubuffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], ubuffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, udest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, udest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, udest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, udest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, udest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref6, udest_ptrs[5], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(udest_ptrs[5], 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref1, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptrs[0] - dest1;
|
||||
|
||||
if (memcmp(dest1, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad1 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest1 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad1 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[1] - dest2;
|
||||
if (memcmp(dest2, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad2 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest2 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad2 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[2] - dest3;
|
||||
if (memcmp(dest3, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad3 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest3 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad3 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[3] - dest4;
|
||||
if (memcmp(dest4, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad4 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest4 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad4 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[4] - dest5;
|
||||
if (memcmp(dest5, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad5 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest5 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad5 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
offset = udest_ptrs[5] - dest6;
|
||||
if (memcmp(dest6, dest_ref1, offset)) {
|
||||
printf("Fail rand ualign pad6 start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest6 + offset + size, dest_ref1, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad6 end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
g1[i] = rand();
|
||||
g2[i] = rand();
|
||||
g3[i] = rand();
|
||||
g4[i] = rand();
|
||||
g5[i] = rand();
|
||||
g6[i] = rand();
|
||||
}
|
||||
|
||||
for (i = 0; i < srcs; i++) {
|
||||
gf_vect_mul_init(g1[i], &g_tbls[i * 32]);
|
||||
gf_vect_mul_init(g2[i], &g_tbls[(32 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g3[i], &g_tbls[(64 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g4[i], &g_tbls[(96 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g5[i], &g_tbls[(128 * srcs) + (i * 32)]);
|
||||
gf_vect_mul_init(g6[i], &g_tbls[(160 * srcs) + (i * 32)]);
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref1);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[32 * srcs], buffs, dest_ref2);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[64 * srcs], buffs, dest_ref3);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[96 * srcs], buffs, dest_ref4);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[128 * srcs], buffs, dest_ref5);
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[160 * srcs], buffs, dest_ref6);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest_ptrs);
|
||||
|
||||
if (memcmp(dest_ref1, dest_ptrs[0], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref1, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[0], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref2, dest_ptrs[1], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref2, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[1], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref3, dest_ptrs[2], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref3, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[2], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref4, dest_ptrs[3], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref4, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[3], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref5, dest_ptrs[4], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref5, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[4], 25);
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest_ref6, dest_ptrs[5], size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref6, 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[5], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Pass\n");
|
||||
return 0;
|
||||
|
||||
}
|
394
erasure_code/gf_6vect_mad_avx.asm
Normal file
394
erasure_code/gf_6vect_mad_avx.asm
Normal file
@ -0,0 +1,394 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define tmp4 r14
|
||||
%define tmp5 rdi
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 5*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define tmp4 r13
|
||||
%define tmp5 r14
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp4
|
||||
%define dest3 tmp2
|
||||
%define dest4 mul_array
|
||||
%define dest5 tmp5
|
||||
%define dest6 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft4_lo xmm14
|
||||
%define xgft4_hi xmm13
|
||||
%define xgft5_lo xmm12
|
||||
%define xgft5_hi xmm11
|
||||
%define xgft6_lo xmm10
|
||||
%define xgft6_hi xmm9
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xtmph3 xmm6
|
||||
%define xtmpl3 xmm7
|
||||
%define xd1 xmm8
|
||||
%define xd2 xtmpl1
|
||||
%define xd3 xtmph1
|
||||
|
||||
|
||||
align 16
|
||||
global gf_6vect_mad_avx:function
|
||||
func(gf_6vect_mad_avx)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
mov tmp, vec
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
lea tmp3, [mul_array + vec_i]
|
||||
sal tmp, 6 ;Multiply by 64
|
||||
|
||||
sal vec, 5 ;Multiply by 32
|
||||
lea vec_i, [tmp + vec] ;vec_i = vec*96
|
||||
lea mul_array, [tmp + vec_i] ;mul_array = vec*160
|
||||
|
||||
vmovdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
vmovdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
|
||||
vmovdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
|
||||
vmovdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
||||
vmovdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
|
||||
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest3, [dest1+2*PS]
|
||||
mov dest4, [dest1+3*PS] ; reuse mul_array
|
||||
mov dest5, [dest1+4*PS]
|
||||
mov dest6, [dest1+5*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop16:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
|
||||
;dest1
|
||||
vpshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
;dest2
|
||||
vpshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xtmph2
|
||||
|
||||
;dest3
|
||||
vpshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xtmph3
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
|
||||
;dest4
|
||||
XLDR xd1, [dest4+pos] ;Get next dest vector
|
||||
vpshufb xtmph1, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl1, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
;dest5
|
||||
vpshufb xtmph2, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl2, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph2
|
||||
|
||||
;dest6
|
||||
vpshufb xtmph3, xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl3, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph3
|
||||
|
||||
XSTR [dest4+pos], xd1 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd2 ;Store result into dest5
|
||||
XSTR [dest6+pos], xd3 ;Store result into dest6
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan16:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
;; Overlapped offset length-16
|
||||
mov tmp, len ;Backup len as len=rdi
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vmovdqa xtmph3, [constip16] ;Load const of i + 16
|
||||
vpinsrb xtmpl3, len.w, 15
|
||||
vpshufb xtmpl3, xmask0f ;Broadcast len to all bytes
|
||||
vpcmpgtb xtmpl3, xtmpl3, xtmph3
|
||||
|
||||
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
|
||||
;dest4
|
||||
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpand xgft4_hi, xgft4_hi, xtmpl3
|
||||
vpxor xd1, xd1, xgft4_hi
|
||||
|
||||
;dest5
|
||||
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
vpand xgft5_hi, xgft5_hi, xtmpl3
|
||||
vpxor xd2, xd2, xgft5_hi
|
||||
|
||||
;dest6
|
||||
vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
vpand xgft6_hi, xgft6_hi, xtmpl3
|
||||
vpxor xd3, xd3, xgft6_hi
|
||||
|
||||
XSTR [dest4+tmp], xd1 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd2 ;Store result into dest5
|
||||
XSTR [dest6+tmp], xd3 ;Store result into dest6
|
||||
|
||||
vmovdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
vmovdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
vmovdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
vmovdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
vmovdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
vmovdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
|
||||
|
||||
;dest1
|
||||
vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
vpand xgft4_hi, xgft4_hi, xtmpl3
|
||||
vpxor xd1, xd1, xgft4_hi
|
||||
|
||||
;dest2
|
||||
vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
vpand xgft5_hi, xgft5_hi, xtmpl3
|
||||
vpxor xd2, xd2, xgft5_hi
|
||||
|
||||
;dest3
|
||||
vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
vpand xgft6_hi, xgft6_hi, xtmpl3
|
||||
vpxor xd3, xd3, xgft6_hi
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
constip16:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_6vect_mad_avx, 02, 01, 0210
|
400
erasure_code/gf_6vect_mad_avx2.asm
Normal file
400
erasure_code/gf_6vect_mad_avx2.asm
Normal file
@ -0,0 +1,400 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 3*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r15, 10*16 + 2*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r15, [rsp + 10*16 + 2*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 tmp3
|
||||
%define dest3 tmp2
|
||||
%define dest4 mul_array
|
||||
%define dest5 vec
|
||||
%define dest6 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
	;;; Use Non-temporal load/store
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm15
|
||||
%define xmask0fx xmm15
|
||||
%define xgft1_lo ymm14
|
||||
%define xgft2_lo ymm13
|
||||
%define xgft3_lo ymm12
|
||||
%define xgft4_lo ymm11
|
||||
%define xgft5_lo ymm10
|
||||
%define xgft6_lo ymm9
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xtmpl ymm2
|
||||
%define xtmplx xmm2
|
||||
%define xtmph ymm3
|
||||
%define xtmphx xmm3
|
||||
%define xd1 ymm4
|
||||
%define xd2 ymm5
|
||||
%define xd3 ymm6
|
||||
%define xd4 ymm7
|
||||
%define xd5 ymm8
|
||||
%define xd6 xd1
|
||||
|
||||
align 16
|
||||
global gf_6vect_mad_avx2:function
|
||||
func(gf_6vect_mad_avx2)
|
||||
FUNC_SAVE
|
||||
sub len, 32
|
||||
jl .return_fail
|
||||
xor pos, pos
|
||||
mov tmp.b, 0x0f
|
||||
vpinsrb xmask0fx, xmask0fx, tmp.w, 0
|
||||
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
|
||||
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
sal vec, 5 ;Multiply by 32
|
||||
lea tmp, [mul_array + vec_i]
|
||||
mov vec_i, vec
|
||||
mov mul_array, vec
|
||||
sal vec_i, 1
|
||||
sal mul_array, 1
|
||||
add vec_i, vec ;vec_i=vec*96
|
||||
	add	mul_array, vec_i	;mul_array=vec*160
|
||||
|
||||
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
|
||||
; " Ax{00}, Ax{10}, ..., Ax{f0}
|
||||
vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
|
||||
; " Bx{00}, Bx{10}, ..., Bx{f0}
|
||||
vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
|
||||
; " Cx{00}, Cx{10}, ..., Cx{f0}
|
||||
	vmovdqu	xgft4_lo, [tmp+vec_i]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
|
||||
					; " Dx{00}, Dx{10}, ..., Dx{f0}
|
||||
vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
	vmovdqu	xgft6_lo, [tmp+mul_array]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
||||
					; " Fx{00}, Fx{10}, ..., Fx{f0}
|
||||
|
||||
mov dest2, [dest1+PS] ; reuse tmp3
|
||||
mov dest3, [dest1+2*PS] ; reuse tmp2
|
||||
mov dest4, [dest1+3*PS] ; reuse mul_array
|
||||
mov dest5, [dest1+4*PS] ; reuse vec
|
||||
mov dest6, [dest1+5*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop32:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
XLDR xd2, [dest2+pos] ;Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;Get next dest vector
|
||||
XLDR xd4, [dest4+pos] ;Get next dest vector
|
||||
XLDR xd5, [dest5+pos] ;Get next dest vector
|
||||
|
||||
vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
;dest1
|
||||
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd1, xd1, xtmph ;xd1 += partial
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
|
||||
;dest2
|
||||
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd2, xd2, xtmph ;xd2 += partial
|
||||
|
||||
;dest3
|
||||
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd3, xd3, xtmph ;xd3 += partial
|
||||
|
||||
XLDR xd6, [dest6+pos] ;reuse xd1. Get next dest vector
|
||||
|
||||
;dest4
|
||||
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd4, xd4, xtmph ;xd4 += partial
|
||||
|
||||
;dest5
|
||||
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd5, xd5, xtmph ;xd5 += partial
|
||||
|
||||
;dest6
|
||||
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xtmpl, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
|
||||
vpxor xd6, xd6, xtmph ;xd6 += partial
|
||||
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
XSTR [dest4+pos], xd4 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd5 ;Store result into dest5
|
||||
XSTR [dest6+pos], xd6 ;Store result into dest6
|
||||
|
||||
add pos, 32 ;Loop on 32 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop32
|
||||
|
||||
lea tmp, [len + 32]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan32:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
mov tmp.b, 0x1f
|
||||
vpinsrb xtmphx, xtmphx, tmp.w, 0
|
||||
vpbroadcastb xtmph, xtmphx ;Construct mask 0x1f1f1f...
|
||||
|
||||
mov tmp, len ;Overlapped offset length-32
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;Get next dest vector
|
||||
XLDR xd4, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd5, [dest5+tmp] ;Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
vpinsrb xtmplx, xtmplx, len.w, 15
|
||||
vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
|
||||
vpshufb xtmpl, xtmpl, xtmph ;Broadcast len to all bytes. xtmph=0x1f1f1f...
|
||||
vpcmpgtb xtmpl, xtmpl, [constip32]
|
||||
|
||||
vpand xtmph, x0, xmask0f ;Mask low src nibble in bits 4-0
|
||||
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
vperm2i128 xtmpa, xtmph, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
|
||||
vperm2i128 x0, xtmph, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
|
||||
|
||||
;dest1
|
||||
vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft1_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd1, xd1, xtmph ;xd1 += partial
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
|
||||
;dest2
|
||||
vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft2_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd2, xd2, xtmph ;xd2 += partial
|
||||
|
||||
;dest3
|
||||
vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft3_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd3, xd3, xtmph ;xd3 += partial
|
||||
|
||||
XLDR xd6, [dest6+tmp] ;reuse xd1. Get next dest vector
|
||||
|
||||
;dest4
|
||||
vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft4_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd4, xd4, xtmph ;xd4 += partial
|
||||
|
||||
;dest5
|
||||
vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft5_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd5, xd5, xtmph ;xd5 += partial
|
||||
|
||||
;dest6
|
||||
vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
|
||||
vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
|
||||
vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
vpxor xtmph, xtmph, xgft6_lo ;GF add high and low partials
|
||||
vpand xtmph, xtmph, xtmpl
|
||||
vpxor xd6, xd6, xtmph ;xd6 += partial
|
||||
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
XSTR [dest4+tmp], xd4 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd5 ;Store result into dest5
|
||||
XSTR [dest6+tmp], xd6 ;Store result into dest6
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
align 32
|
||||
constip32:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_6vect_mad_avx2, 04, 01, 0211
|
406
erasure_code/gf_6vect_mad_sse.asm
Normal file
406
erasure_code/gf_6vect_mad_sse.asm
Normal file
@ -0,0 +1,406 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define PS 8
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg0.w ecx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
%define arg4 r12
|
||||
%define arg5 r15
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp2 r10
|
||||
%define tmp3 r13
|
||||
%define tmp4 r14
|
||||
%define tmp5 rdi
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
%define stack_size 16*10 + 5*8
|
||||
%define arg(x) [rsp + stack_size + PS + PS*x]
|
||||
%define func(x) proc_frame x
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
sub rsp, stack_size
|
||||
movdqa [rsp+16*0],xmm6
|
||||
movdqa [rsp+16*1],xmm7
|
||||
movdqa [rsp+16*2],xmm8
|
||||
movdqa [rsp+16*3],xmm9
|
||||
movdqa [rsp+16*4],xmm10
|
||||
movdqa [rsp+16*5],xmm11
|
||||
movdqa [rsp+16*6],xmm12
|
||||
movdqa [rsp+16*7],xmm13
|
||||
movdqa [rsp+16*8],xmm14
|
||||
movdqa [rsp+16*9],xmm15
|
||||
save_reg r12, 10*16 + 0*8
|
||||
save_reg r13, 10*16 + 1*8
|
||||
save_reg r14, 10*16 + 2*8
|
||||
save_reg r15, 10*16 + 3*8
|
||||
save_reg rdi, 10*16 + 4*8
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
mov arg5, arg(5)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
movdqa xmm6, [rsp+16*0]
|
||||
movdqa xmm7, [rsp+16*1]
|
||||
movdqa xmm8, [rsp+16*2]
|
||||
movdqa xmm9, [rsp+16*3]
|
||||
movdqa xmm10, [rsp+16*4]
|
||||
movdqa xmm11, [rsp+16*5]
|
||||
movdqa xmm12, [rsp+16*6]
|
||||
movdqa xmm13, [rsp+16*7]
|
||||
movdqa xmm14, [rsp+16*8]
|
||||
movdqa xmm15, [rsp+16*9]
|
||||
mov r12, [rsp + 10*16 + 0*8]
|
||||
mov r13, [rsp + 10*16 + 1*8]
|
||||
mov r14, [rsp + 10*16 + 2*8]
|
||||
mov r15, [rsp + 10*16 + 3*8]
|
||||
mov rdi, [rsp + 10*16 + 4*8]
|
||||
add rsp, stack_size
|
||||
%endmacro
|
||||
|
||||
%elifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg0.w edi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp2 r10
|
||||
%define tmp3 r12
|
||||
%define tmp4 r13
|
||||
%define tmp5 r14
|
||||
%define return rax
|
||||
%define return.w eax
|
||||
|
||||
%define func(x) x:
|
||||
%macro FUNC_SAVE 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
%endmacro
|
||||
%macro FUNC_RESTORE 0
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
|
||||
%define len arg0
|
||||
%define len.w arg0.w
|
||||
%define vec arg1
|
||||
%define vec_i arg2
|
||||
%define mul_array arg3
|
||||
%define src arg4
|
||||
%define dest1 arg5
|
||||
%define pos return
|
||||
%define pos.w return.w
|
||||
|
||||
%define dest2 mul_array
|
||||
%define dest3 tmp2
|
||||
%define dest4 tmp4
|
||||
%define dest5 tmp5
|
||||
%define dest6 vec_i
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR movdqu
|
||||
%define XSTR movdqu
|
||||
%else
|
||||
	;;; Use Non-temporal load/store
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR movdqa
|
||||
%define XSTR movdqa
|
||||
%else
|
||||
%define XLDR movntdqa
|
||||
%define XSTR movntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
default rel
|
||||
|
||||
[bits 64]
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm15
|
||||
%define xgft4_lo xmm14
|
||||
%define xgft4_hi xmm13
|
||||
%define xgft5_lo xmm12
|
||||
%define xgft5_hi xmm11
|
||||
%define xgft6_lo xmm10
|
||||
%define xgft6_hi xmm9
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xtmph1 xmm2
|
||||
%define xtmpl1 xmm3
|
||||
%define xtmph2 xmm4
|
||||
%define xtmpl2 xmm5
|
||||
%define xtmph3 xmm6
|
||||
%define xtmpl3 xmm7
|
||||
%define xd1 xmm8
|
||||
%define xd2 xtmpl1
|
||||
%define xd3 xtmph1
|
||||
|
||||
|
||||
align 16
|
||||
global gf_6vect_mad_sse:function
|
||||
func(gf_6vect_mad_sse)
|
||||
FUNC_SAVE
|
||||
sub len, 16
|
||||
jl .return_fail
|
||||
|
||||
xor pos, pos
|
||||
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
|
||||
|
||||
mov tmp, vec
|
||||
sal vec_i, 5 ;Multiply by 32
|
||||
lea tmp3, [mul_array + vec_i]
|
||||
sal tmp, 6 ;Multiply by 64
|
||||
|
||||
sal vec, 5 ;Multiply by 32
|
||||
	lea	vec_i, [tmp + vec]	;vec_i = vec*96
|
||||
	lea	mul_array, [tmp + vec_i]	;mul_array = vec*160
|
||||
|
||||
movdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
|
||||
movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
|
||||
movdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
|
||||
movdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
|
||||
movdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
|
||||
movdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
|
||||
|
||||
mov dest2, [dest1+PS]
|
||||
mov dest3, [dest1+2*PS]
|
||||
mov dest4, [dest1+3*PS] ; reuse mul_array
|
||||
mov dest5, [dest1+4*PS]
|
||||
mov dest6, [dest1+5*PS] ; reuse vec_i
|
||||
mov dest1, [dest1]
|
||||
|
||||
.loop16:
|
||||
XLDR x0, [src+pos] ;Get next source vector
|
||||
|
||||
movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+pos] ;Get next dest vector
|
||||
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
;dest1
|
||||
pshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
pxor xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest3 vector
|
||||
|
||||
;dest2
|
||||
pshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
pxor xd2, xtmph2
|
||||
|
||||
;dest3
|
||||
pshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
pxor xd3, xtmph3
|
||||
|
||||
XSTR [dest1+pos], xd1 ;Store result into dest1
|
||||
XSTR [dest2+pos], xd2 ;Store result into dest2
|
||||
XSTR [dest3+pos], xd3 ;Store result into dest3
|
||||
|
||||
movdqa xtmph1, xgft4_hi ;Reload const array registers
|
||||
movdqa xtmpl1, xgft4_lo ;Reload const array registers
|
||||
movdqa xtmph2, xgft5_hi ;Reload const array registers
|
||||
movdqa xtmpl2, xgft5_lo ;Reload const array registers
|
||||
movdqa xtmph3, xgft6_hi ;Reload const array registers
|
||||
movdqa xtmpl3, xgft6_lo ;Reload const array registers
|
||||
|
||||
;dest4
|
||||
XLDR xd1, [dest4+pos] ;Get next dest vector
|
||||
pshufb xtmph1, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph1, xtmpl1 ;GF add high and low partials
|
||||
pxor xd1, xtmph1
|
||||
|
||||
XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
;dest5
|
||||
pshufb xtmph2, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph2, xtmpl2 ;GF add high and low partials
|
||||
pxor xd2, xtmph2
|
||||
|
||||
;dest6
|
||||
pshufb xtmph3, x0 ;Lookup mul table of high nibble
|
||||
pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xtmph3, xtmpl3 ;GF add high and low partials
|
||||
pxor xd3, xtmph3
|
||||
|
||||
XSTR [dest4+pos], xd1 ;Store result into dest4
|
||||
XSTR [dest5+pos], xd2 ;Store result into dest5
|
||||
XSTR [dest6+pos], xd3 ;Store result into dest6
|
||||
|
||||
add pos, 16 ;Loop on 16 bytes at a time
|
||||
cmp pos, len
|
||||
jle .loop16
|
||||
|
||||
lea tmp, [len + 16]
|
||||
cmp pos, tmp
|
||||
je .return_pass
|
||||
|
||||
.lessthan16:
|
||||
;; Tail len
|
||||
;; Do one more overlap pass
|
||||
;; Overlapped offset length-16
|
||||
mov tmp, len ;Backup len as len=rdi
|
||||
|
||||
XLDR x0, [src+tmp] ;Get next source vector
|
||||
XLDR xd1, [dest4+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
|
||||
|
||||
sub len, pos
|
||||
|
||||
movdqa xtmph3, [constip16] ;Load const of i + 16
|
||||
pinsrb xtmpl3, len.w, 15
|
||||
pshufb xtmpl3, xmask0f ;Broadcast len to all bytes
|
||||
pcmpgtb xtmpl3, xtmph3
|
||||
|
||||
movdqa xtmpa, x0 ;Keep unshifted copy of src
|
||||
psraw x0, 4 ;Shift to put high nibble into bits 4-0
|
||||
pand x0, xmask0f ;Mask high src nibble in bits 4-0
|
||||
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
|
||||
|
||||
;dest4
|
||||
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
pand xgft4_hi, xtmpl3
|
||||
pxor xd1, xgft4_hi
|
||||
|
||||
;dest5
|
||||
pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
pand xgft5_hi, xtmpl3
|
||||
pxor xd2, xgft5_hi
|
||||
|
||||
;dest6
|
||||
pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
pand xgft6_hi, xtmpl3
|
||||
pxor xd3, xgft6_hi
|
||||
|
||||
XSTR [dest4+tmp], xd1 ;Store result into dest4
|
||||
XSTR [dest5+tmp], xd2 ;Store result into dest5
|
||||
XSTR [dest6+tmp], xd3 ;Store result into dest6
|
||||
|
||||
movdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
|
||||
movdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
|
||||
movdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
|
||||
movdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
|
||||
movdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
|
||||
movdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
|
||||
XLDR xd1, [dest1+tmp] ;Get next dest vector
|
||||
XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
|
||||
XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
|
||||
|
||||
;dest1
|
||||
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
|
||||
pand xgft4_hi, xtmpl3
|
||||
pxor xd1, xgft4_hi
|
||||
|
||||
;dest2
|
||||
pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft5_hi, xgft5_lo ;GF add high and low partials
|
||||
pand xgft5_hi, xtmpl3
|
||||
pxor xd2, xgft5_hi
|
||||
|
||||
;dest3
|
||||
pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
|
||||
pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
|
||||
pxor xgft6_hi, xgft6_lo ;GF add high and low partials
|
||||
pand xgft6_hi, xtmpl3
|
||||
pxor xd3, xgft6_hi
|
||||
|
||||
XSTR [dest1+tmp], xd1 ;Store result into dest1
|
||||
XSTR [dest2+tmp], xd2 ;Store result into dest2
|
||||
XSTR [dest3+tmp], xd3 ;Store result into dest3
|
||||
|
||||
.return_pass:
|
||||
FUNC_RESTORE
|
||||
mov return, 0
|
||||
ret
|
||||
|
||||
.return_fail:
|
||||
FUNC_RESTORE
|
||||
mov return, 1
|
||||
ret
|
||||
|
||||
endproc_frame
|
||||
|
||||
section .data
|
||||
|
||||
align 16
|
||||
|
||||
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
|
||||
constip16:
|
||||
ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
|
||||
|
||||
;;; func core, ver, snum
|
||||
slversion gf_6vect_mad_sse, 00, 01, 020f
|
225
erasure_code/gf_inverse_test.c
Normal file
225
erasure_code/gf_inverse_test.c
Normal file
@ -0,0 +1,225 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include <assert.h>
|
||||
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 128
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 200
|
||||
#endif
|
||||
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Multiply two n x n matrices over GF(2^8): c = a * b.
 * Element products come from gf_mul(); accumulation uses XOR,
 * which is addition in GF(2^8). c must not alias a or b.
 */
void matrix_mult(u8 * a, u8 * b, u8 * c, int n)
{
	int row, col, k;
	u8 acc;

	for (row = 0; row < n; row++) {
		for (col = 0; col < n; col++) {
			acc = 0;
			for (k = 0; k < n; k++)
				acc ^= gf_mul(a[n * row + k], b[n * k + col]);
			c[row * n + col] = acc;
		}
	}
}
|
||||
|
||||
/* Dump an n x n byte matrix to stdout in hex, one row per line,
 * with a trailing blank line as a separator. */
void print_matrix(u8 * a, int n)
{
	int row, col;

	for (row = 0; row < n; row++) {
		for (col = 0; col < n; col++)
			printf(" %2x", a[row * n + col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Check whether the n x n matrix a is the identity matrix.
 * Returns 0 if it is, -1 on the first mismatching element.
 */
int is_ident(u8 * a, const int n)
{
	int row, col;
	u8 expect;

	for (row = 0; row < n; row++) {
		for (col = 0; col < n; col++) {
			expect = (row == col) ? 1 : 0;
			if (*a++ != expect)
				return -1;
		}
	}
	return 0;
}
|
||||
|
||||
/*
 * Invert the n x n matrix 'in' into 'inv' and verify inv * in == I.
 * 'sav' receives a pristine copy of 'in' (gf_invert_matrix works in place).
 * Returns 0 on success (and prints a progress dot), -1 on a singular
 * input or a failed identity check (diagnostics are printed).
 */
int inv_test(u8 * in, u8 * inv, u8 * sav, int n)
{
	// Preserve the original since inversion destroys 'in'
	memcpy(sav, in, n * n);

	if (gf_invert_matrix(in, inv, n) != 0) {
		printf("Given singular matrix\n");
		print_matrix(sav, n);
		return -1;
	}

	// inv * original should reproduce the identity
	matrix_mult(inv, sav, in, n);

	if (is_ident(in, n) != 0) {
		printf("fail\n");
		print_matrix(sav, n);
		print_matrix(inv, n);
		print_matrix(in, n);
		return -1;
	}

	putchar('.');

	return 0;
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, k, t;
|
||||
u8 *test_mat, *save_mat, *invr_mat;
|
||||
|
||||
u8 test1[] = { 1, 1, 6,
|
||||
1, 1, 1,
|
||||
7, 1, 9
|
||||
};
|
||||
|
||||
u8 test2[] = { 0, 1, 6,
|
||||
1, 0, 1,
|
||||
0, 1, 9
|
||||
};
|
||||
|
||||
u8 test3[] = { 0, 0, 1,
|
||||
1, 0, 0,
|
||||
0, 1, 1
|
||||
};
|
||||
|
||||
u8 test4[] = { 0, 1, 6, 7,
|
||||
1, 1, 0, 0,
|
||||
0, 1, 2, 3,
|
||||
3, 2, 2, 3
|
||||
}; // = row3+3*row2
|
||||
|
||||
printf("gf_inverse_test: max=%d ", KMAX);
|
||||
|
||||
test_mat = malloc(KMAX * KMAX);
|
||||
save_mat = malloc(KMAX * KMAX);
|
||||
invr_mat = malloc(KMAX * KMAX);
|
||||
|
||||
if (NULL == test_mat || NULL == save_mat || NULL == invr_mat)
|
||||
return -1;
|
||||
|
||||
// Test with lots of leading 1's
|
||||
k = 3;
|
||||
memcpy(test_mat, test1, k * k);
|
||||
if (inv_test(test_mat, invr_mat, save_mat, k))
|
||||
return -1;
|
||||
|
||||
// Test with leading zeros
|
||||
k = 3;
|
||||
memcpy(test_mat, test2, k * k);
|
||||
if (inv_test(test_mat, invr_mat, save_mat, k))
|
||||
return -1;
|
||||
|
||||
// Test 3
|
||||
k = 3;
|
||||
memcpy(test_mat, test3, k * k);
|
||||
if (inv_test(test_mat, invr_mat, save_mat, k))
|
||||
return -1;
|
||||
|
||||
// Test 4 - try a singular matrix
|
||||
k = 4;
|
||||
memcpy(test_mat, test4, k * k);
|
||||
if (!gf_invert_matrix(test_mat, invr_mat, k)) {
|
||||
printf("Fail: didn't catch singular matrix\n");
|
||||
print_matrix(test4, 4);
|
||||
return -1;
|
||||
}
|
||||
// Do random test of size KMAX
|
||||
k = KMAX;
|
||||
|
||||
for (i = 0; i < k * k; i++)
|
||||
test_mat[i] = save_mat[i] = rand();
|
||||
|
||||
if (gf_invert_matrix(test_mat, invr_mat, k)) {
|
||||
printf("rand picked a singular matrix, try again\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
matrix_mult(invr_mat, save_mat, test_mat, k);
|
||||
|
||||
if (is_ident(test_mat, k)) {
|
||||
printf("fail\n");
|
||||
print_matrix(save_mat, k);
|
||||
print_matrix(invr_mat, k);
|
||||
print_matrix(test_mat, k);
|
||||
return -1;
|
||||
}
|
||||
// Do Randoms. Random size and coefficients
|
||||
for (t = 0; t < RANDOMS; t++) {
|
||||
k = rand() % KMAX;
|
||||
|
||||
for (i = 0; i < k * k; i++)
|
||||
test_mat[i] = save_mat[i] = rand();
|
||||
|
||||
if (gf_invert_matrix(test_mat, invr_mat, k))
|
||||
continue;
|
||||
|
||||
matrix_mult(invr_mat, save_mat, test_mat, k);
|
||||
|
||||
if (is_ident(test_mat, k)) {
|
||||
printf("fail rand k=%d\n", k);
|
||||
print_matrix(save_mat, k);
|
||||
print_matrix(invr_mat, k);
|
||||
print_matrix(test_mat, k);
|
||||
return -1;
|
||||
}
|
||||
if (0 == (t % 8))
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" Pass\n");
|
||||
return 0;
|
||||
}
|
166
erasure_code/gf_vect_dot_prod_1tbl.c
Normal file
166
erasure_code/gf_vect_dot_prod_1tbl.c
Normal file
@ -0,0 +1,166 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "test.h"
|
||||
#include "erasure_code.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / TEST_SOURCES
|
||||
# define TEST_LOOPS 10
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Global GF(256) tables
|
||||
u8 gff[256];
|
||||
u8 gflog[256];
|
||||
u8 gf_mul_table[256 * 256];
|
||||
|
||||
void mk_gf_field(void)
|
||||
{
|
||||
int i;
|
||||
u8 s = 1;
|
||||
gflog[0] = 0;
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
gff[i] = s;
|
||||
gflog[s] = i;
|
||||
s = (s << 1) ^ ((s & 0x80) ? 0x1d : 0); // mult by GF{2}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Populate a single 256x256 table with all multiply combinations for a
 * fast, single-table lookup of GF(2^8) multiply at the expense of memory
 * (64 KiB).  table[a*256 + b] == gf_mul(a, b).
 */
void mk_gf_mul_table(u8 * table)
{
	int a, b;

	for (a = 0; a < 256; a++)
		for (b = 0; b < 256; b++)
			table[a * 256 + b] = gf_mul(a, b);
}
|
||||
|
||||
/*
 * Reference GF(2^8) vector dot product:
 *   dest[i] = XOR over j of gf_mul(src[j][i], v[j])
 * for i in [0, len) and j in [0, vlen).
 */
void gf_vect_dot_prod_ref(int len, int vlen, u8 * v, u8 ** src, u8 * dest)
{
	int pos, n;
	u8 acc;

	for (pos = 0; pos < len; pos++) {
		acc = 0;
		for (n = 0; n < vlen; n++)
			acc ^= gf_mul(src[n][pos], v[n]);

		dest[pos] = acc;
	}
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int i, j, k;
|
||||
u8 s, vec[TEST_SOURCES], dest1[TEST_LEN], dest2[TEST_LEN];
|
||||
u8 *matrix[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
mk_gf_field();
|
||||
mk_gf_mul_table(gf_mul_table);
|
||||
|
||||
//generate random vector and matrix/data
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
vec[i] = rand();
|
||||
|
||||
if (!(matrix[i] = malloc(TEST_LEN))) {
|
||||
fprintf(stderr, "Error failure\n\n");
|
||||
return -1;
|
||||
}
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
matrix[i][j] = rand();
|
||||
|
||||
}
|
||||
|
||||
gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++)
|
||||
gf_vect_dot_prod_ref(TEST_LEN, TEST_SOURCES, vec, matrix, dest1);
|
||||
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_2tbl" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
// Warm up mult tables
|
||||
for (i = 0; i < TEST_LEN; i++) {
|
||||
s = 0;
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
s ^= gf_mul_table[vec[j] * 256 + matrix[j][i]];
|
||||
}
|
||||
dest2[i] = s;
|
||||
}
|
||||
|
||||
perf_start(&start);
|
||||
for (k = 0; k < TEST_LOOPS; k++) {
|
||||
for (i = 0; i < TEST_LEN; i++) {
|
||||
s = 0;
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
s ^= gf_mul_table[vec[j] * 256 + matrix[j][i]];
|
||||
}
|
||||
dest2[i] = s;
|
||||
}
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_1tbl" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * k);
|
||||
|
||||
// Compare with reference function
|
||||
if (0 != memcmp(dest1, dest2, TEST_LEN)) {
|
||||
printf("Error, different results!\n\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("Pass functional test\n");
|
||||
return 0;
|
||||
}
|
271
erasure_code/gf_vect_dot_prod_avx.asm
Normal file
271
erasure_code/gf_vect_dot_prod_avx.asm
Normal file
@ -0,0 +1,271 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved and loaded
|
||||
%define tmp r11
|
||||
%define tmp2 r10
|
||||
%define tmp3 rdi ; must be saved and loaded
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define frame_size 2*8
|
||||
%define arg(x) [rsp + frame_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
rex_push_reg r12
|
||||
push_reg rdi
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop rdi
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
|
||||
%define trans ecx ;trans is for the variables in stack
|
||||
%define arg0 trans
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 trans
|
||||
%define arg1_m arg(1)
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 ebx
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp2 edi
|
||||
%define tmp3 esi
|
||||
%define return eax
|
||||
%macro SLDR 2 ;; stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg3, arg(3)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
mov esp, ebp
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define vec_m arg1_m
|
||||
%define len_m arg0_m
|
||||
%define dest_m arg4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ; 64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%define xmask0f xmm5
|
||||
%define xgft_lo xmm4
|
||||
%define xgft_hi xmm3
|
||||
|
||||
%define x0 xmm0
|
||||
%define xtmpa xmm1
|
||||
%define xp xmm2
|
||||
|
||||
align 16
;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest)
;;; For each 16-byte chunk: dest[i] = XOR over j of gf_mul(g[j], buffs[j][i]),
;;; using two 16-entry nibble lookup tables (low/high) per source via vpshufb.
global gf_vect_dot_prod_avx:function
func(gf_vect_dot_prod_avx)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; bias len so 'pos <= len' tests the last full 16B chunk
	SSTR	len_m, len
	jl	.return_fail		; input shorter than one 16-byte vector
	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

.loop16:
	vpxor	xp, xp			; clear the 16-byte accumulator
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:

	mov	ptr, [src+vec_i*PS]
	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	vmovdqu	xgft_hi, [tmp+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	XLDR	x0, [ptr+pos]		;Get next source vector

	add	tmp, 32			; 32 table bytes per source
	add	vec_i, 1

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft_hi, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xgft_lo, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft_hi, xgft_hi, xgft_lo	;GF add high and low partials
	vpxor	xp, xp, xgft_hi		;xp += partial

	SLDR	vec, vec_m
	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest, dest_m
	XSTR	[dest+pos], xp

	add	pos, 16			;Loop on 16 bytes at a time
	SLDR	len, len_m
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]		; un-bias: tmp = original length
	cmp	pos, tmp
	je	.return_pass		; length was a multiple of 16 - done

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f:
	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func        core, ver, snum
slversion gf_vect_dot_prod_avx, 02, 05, 0061
|
280
erasure_code/gf_vect_dot_prod_avx2.asm
Normal file
280
erasure_code/gf_vect_dot_prod_avx2.asm
Normal file
@ -0,0 +1,280 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
|
||||
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest);
|
||||
;;;
|
||||
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf64
|
||||
%define arg0 rdi
|
||||
%define arg1 rsi
|
||||
%define arg2 rdx
|
||||
%define arg3 rcx
|
||||
%define arg4 r8
|
||||
%define arg5 r9
|
||||
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 r9
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define func(x) x:
|
||||
%define FUNC_SAVE
|
||||
%define FUNC_RESTORE
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, win64
|
||||
%define arg0 rcx
|
||||
%define arg1 rdx
|
||||
%define arg2 r8
|
||||
%define arg3 r9
|
||||
|
||||
%define arg4 r12 ; must be saved and loaded
|
||||
%define tmp r11
|
||||
%define tmp.w r11d
|
||||
%define tmp.b r11b
|
||||
%define tmp2 r10
|
||||
%define tmp3 rdi ; must be saved and loaded
|
||||
%define return rax
|
||||
%macro SLDR 2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
%define PS 8
|
||||
%define frame_size 2*8
|
||||
%define arg(x) [rsp + frame_size + PS + PS*x]
|
||||
|
||||
%define func(x) proc_frame x
|
||||
%macro FUNC_SAVE 0
|
||||
rex_push_reg r12
|
||||
push_reg rdi
|
||||
end_prolog
|
||||
mov arg4, arg(4)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop rdi
|
||||
pop r12
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%ifidn __OUTPUT_FORMAT__, elf32
|
||||
|
||||
;;;================== High Address;
|
||||
;;; arg4
|
||||
;;; arg3
|
||||
;;; arg2
|
||||
;;; arg1
|
||||
;;; arg0
|
||||
;;; return
|
||||
;;;<================= esp of caller
|
||||
;;; ebp
|
||||
;;;<================= ebp = esp
|
||||
;;; esi
|
||||
;;; edi
|
||||
;;; ebx
|
||||
;;;<================= esp of callee
|
||||
;;;
|
||||
;;;================== Low Address;
|
||||
|
||||
%define PS 4
|
||||
%define LOG_PS 2
|
||||
%define func(x) x:
|
||||
%define arg(x) [ebp + PS*2 + PS*x]
|
||||
|
||||
%define trans ecx ;trans is for the variables in stack
|
||||
%define arg0 trans
|
||||
%define arg0_m arg(0)
|
||||
%define arg1 trans
|
||||
%define arg1_m arg(1)
|
||||
%define arg2 arg2_m
|
||||
%define arg2_m arg(2)
|
||||
%define arg3 ebx
|
||||
%define arg4 trans
|
||||
%define arg4_m arg(4)
|
||||
%define tmp edx
|
||||
%define tmp.w edx
|
||||
%define tmp.b dl
|
||||
%define tmp2 edi
|
||||
%define tmp3 esi
|
||||
%define return eax
|
||||
%macro SLDR 2 ;stack load/restore
|
||||
mov %1, %2
|
||||
%endmacro
|
||||
%define SSTR SLDR
|
||||
|
||||
%macro FUNC_SAVE 0
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
mov arg3, arg(3)
|
||||
%endmacro
|
||||
|
||||
%macro FUNC_RESTORE 0
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
mov esp, ebp
|
||||
pop ebp
|
||||
%endmacro
|
||||
|
||||
%endif ; output formats
|
||||
|
||||
%define len arg0
|
||||
%define vec arg1
|
||||
%define mul_array arg2
|
||||
%define src arg3
|
||||
%define dest arg4
|
||||
|
||||
%define vec_i tmp2
|
||||
%define ptr tmp3
|
||||
%define pos return
|
||||
|
||||
%ifidn PS,4 ;32-bit code
|
||||
%define vec_m arg1_m
|
||||
%define len_m arg0_m
|
||||
%define dest_m arg4_m
|
||||
%endif
|
||||
|
||||
%ifndef EC_ALIGNED_ADDR
|
||||
;;; Use Un-aligned load/store
|
||||
%define XLDR vmovdqu
|
||||
%define XSTR vmovdqu
|
||||
%else
|
||||
;;; Use Non-temporal load/stor
|
||||
%ifdef NO_NT_LDST
|
||||
%define XLDR vmovdqa
|
||||
%define XSTR vmovdqa
|
||||
%else
|
||||
%define XLDR vmovntdqa
|
||||
%define XSTR vmovntdq
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%ifidn PS,8 ;64-bit code
|
||||
default rel
|
||||
[bits 64]
|
||||
%endif
|
||||
|
||||
section .text
|
||||
|
||||
%define xmask0f ymm3
|
||||
%define xmask0fx xmm3
|
||||
%define xgft_lo ymm4
|
||||
%define xgft_hi ymm5
|
||||
|
||||
%define x0 ymm0
|
||||
%define xtmpa ymm1
|
||||
%define xp ymm2
|
||||
|
||||
align 16
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest)
;;; AVX2 variant of the GF(2^8) dot product; processes 32 bytes per pass by
;;; replicating the 16-byte nibble tables into both ymm lanes with vperm2i128.
global gf_vect_dot_prod_avx2:function
func(gf_vect_dot_prod_avx2)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 32			; bias len so 'pos <= len' tests the last full 32B chunk
	SSTR	len_m, len
	jl	.return_fail		; input shorter than one 32-byte vector
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

.loop32:
	vpxor	xp, xp			; clear the 32-byte accumulator
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:

	mov	ptr, [src+vec_i*PS]

	vmovdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, Cx{02}, ...
					; "     Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo

	XLDR	x0, [ptr+pos]		;Get next source vector

	add	tmp, 32			; 32 table bytes per source
	add	vec_i, 1

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xgft_hi, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xgft_lo, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xgft_hi, xgft_hi, xgft_lo	;GF add high and low partials
	vpxor	xp, xp, xgft_hi		;xp += partial

	SLDR	vec, vec_m
	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest, dest_m
	XSTR	[dest+pos], xp

	add	pos, 32			;Loop on 32 bytes at a time
	SLDR	len, len_m
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]		; un-bias: tmp = original length
	cmp	pos, tmp
	je	.return_pass		; length was a multiple of 32 - done

	;; Tail len
	mov	pos, len		;Overlapped offset length-32
	jmp	.loop32			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;; func        core, ver, snum
slversion gf_vect_dot_prod_avx2, 04, 05, 0190
|
184
erasure_code/gf_vect_dot_prod_avx_perf.c
Normal file
184
erasure_code/gf_vect_dot_prod_avx_perf.c
Normal file
@ -0,0 +1,184 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_avx
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
/* Hex-dump len bytes of buf, 32 bytes per line, ending with a newline. */
void dump(unsigned char *buf, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		printf(" %2x", 0xff & buf[i]);
		if ((i + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/* Hex-dump the first m bytes of each of the k row buffers in s. */
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
|
||||
u8 *temp_buff, *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
}
|
525
erasure_code/gf_vect_dot_prod_avx_test.c
Normal file
525
erasure_code/gf_vect_dot_prod_avx_test.c
Normal file
@ -0,0 +1,525 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_avx
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print `len` bytes of `buf` to stdout as hex, 32 bytes per output row,
// ending with a final newline.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k-row matrix of byte buffers (m columns from each row pointer)
// as hex, one matrix row per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *p = s[row];

		for (col = 0; col < m; col++)
			printf(" %2x", p[col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a flat k*m byte array as a k-row, m-column hex matrix
// (row-major layout: element (i,j) is s[i*m + j]).
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		unsigned char *p = s + (row * m);

		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & p[col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Exhaustive correctness test for FUNCTION_UNDER_TEST (default
// gf_vect_dot_prod_avx) against the reference gf_vect_dot_prod_base:
//  1. all-zero input, 2. random data, 3. random data with varied source
//  counts, 4. an erasure-code encode/decode round trip, 5. many random
//  erasure trials, 6. end-of-buffer (Electric Fence) placement,
//  7. random pointer alignment with write-over detection, 8. all length
//  alignments. Returns 0 on pass, -1 on any mismatch or alloc failure.
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs, m, k, nerrs, r, err;
	void *buf;
	// g: per-source GF multipliers; g_tbls: 32-byte expanded mul table per
	// source (filled by gf_vect_mul_init); src_in_err: erasure flags.
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
	// a: encode matrix; b: a with erased rows removed; d: inverse of b.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptr;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned; never freed — process exits)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;

	// Test of all zeros
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);
	memset(g, 0, TEST_SOURCES);

	for (i = 0; i < TEST_SOURCES; i++)
		gf_vect_mul_init(g[i], &g_tbls[i * 32]);

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);

	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

	if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref, 25);
		printf("dprod:");
		dump(dest, 25);
		return -1;
	} else
		putchar('.');

	// Rand data test
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

		if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}

		putchar('.');
	}

	// Rand data test with varied parameters (every source count 1..TEST_SOURCES)
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < srcs; i++)
				g[i] = rand();

			for (i = 0; i < srcs; i++)
				gf_vect_mul_init(g[i], &g_tbls[i * 32]);

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
			FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);

			if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
				dump_matrix(buffs, 5, srcs);
				printf("dprod_base:");
				dump(dest_ref, 5);
				printf("dprod:");
				dump(dest, 5);
				return -1;
			}

			putchar('.');
		}
	}

	// Test erasure code using gf_vect_dot_prod

	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	gf_gen_rs_matrix(a, m, k);

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Make parity vects (rows k..m-1 hold parity computed from rows 0..k-1)
	for (i = k; i < m; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
	}

	// Random buffers in erasure (at most m-k sources marked lost)
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
		err = 1 & rand();
		src_in_err[i] = err;
		if (err)
			src_err_list[nerrs++] = i;
	}

	// construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
		printf("BAD MATRIX\n");

	// Gather the k surviving buffers used for recovery
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		recov[i] = buffs[r];
	}

	// Recover data: each lost source is a dot product of survivors with a
	// row of the inverted matrix; result must match the original buffer.
	for (i = 0; i < nerrs; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif

		if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buff, 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		gf_gen_rs_matrix(a, m, k);

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// Make parity vects
		for (i = k; i < m; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
		}

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
			err = 1 & rand();
			src_in_err[i] = err;
			if (err)
				src_err_list[nerrs++] = i;
		}
		if (nerrs == 0) {	// should have at least one error
			while ((err = (rand() % KMAX)) >= k) ;
			src_err_list[nerrs++] = err;
			src_in_err[err] = 1;
		}
		// construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
			printf("BAD MATRIX\n");

		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			recov[i] = buffs[r];
		}

		// Recover data
		for (i = 0; i < nerrs; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
			if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				// NOTE(review): this diagnostic loop reuses i, clobbering
				// the index of the failed recovery before the dumps below.
				for (i = 0; i < nerrs; i++)
					printf(" %d", src_err_list[i]);
				printf("\na:\n");
				dump_u8xu8((u8 *) a, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) d, k, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buff, 25);
				return -1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
		FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);

		if (0 != memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, align);
			printf("dprod:");
			dump(dest, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets (0..PTR_ALIGN_CHK_B-1 when misalignment enabled)
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest, 0, TEST_LEN);	// zero pad to check write-over

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);

		if (memcmp(dest_ref, udest_ptr, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(udest_ptr, 25);
			return -1;
		}
		// Confirm that padding around dests is unchanged
		memset(dest_ref, 0, PTR_ALIGN_CHK_B);	// Make reference zero buff
		offset = udest_ptr - dest;

		if (memcmp(dest, dest_ref, offset)) {
			printf("Fail rand ualign pad start\n");
			return -1;
		}
		if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad end\n");
			return -1;
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		srcs = TEST_SOURCES;

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);

		if (memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}
	}

	printf("done all: Pass\n");
	return 0;
}
|
290
erasure_code/gf_vect_dot_prod_base_test.c
Normal file
290
erasure_code/gf_vect_dot_prod_base_test.c
Normal file
@ -0,0 +1,290 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 250
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump `len` bytes of `buf`, wrapping to a new row every 32 bytes,
// followed by a trailing newline.
void dump(unsigned char *buf, int len)
{
	int n;

	for (n = 0; n < len; ) {
		printf(" %2x", 0xff & buf[n]);
		n++;
		if (n % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of the k buffers in `s` as a hex matrix,
// one buffer per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", s[r][c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a row-major k*m byte array `s` as a k x m hex matrix.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int r, c;

	for (r = 0; r < k; r++) {
		for (c = 0; c < m; c++)
			printf(" %2x", 0xff & s[(r * m) + c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Correctness test for the reference gf_vect_dot_prod_base: encodes with a
// Cauchy matrix, erases random sources, recovers them via the inverted
// sub-matrix, and verifies recovered data matches the original — once with
// fixed (m=9, k=5) parameters and then RANDOMS times with random (m, k).
// Returns 0 on pass, -1 on any failure.
int main(int argc, char *argv[])
{
	int i, j, rtest, m, k, nerrs, r, err;
	void *buf;
	// g: scratch multipliers (zeroed, unused after init here); g_tbls:
	// 32 bytes of expanded mul table per source; src_in_err: erasure flags.
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
	// a: encode matrix; b: a minus erased rows; d: inverse of b.
	u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];

	printf("gf_vect_dot_prod_base: %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64-byte aligned; never freed — process exits)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;

	// Init
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);
	memset(g, 0, TEST_SOURCES);

	// Test erasure code using gf_vect_dot_prod
	// Pick a first test
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	gf_gen_cauchy1_matrix(a, m, k);

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Make parity vects (rows k..m-1 computed from data rows 0..k-1)
	for (i = k; i < m; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);

		gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, buffs, buffs[i]);
	}

	// Random buffers in erasure (at most m-k sources marked lost)
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
		err = 1 & rand();
		src_in_err[i] = err;
		if (err)
			src_err_list[nerrs++] = i;
	}

	// construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
		printf("BAD MATRIX\n");

	// Gather the k surviving buffers used for recovery
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		recov[i] = buffs[r];
	}

	// Recover data: each lost source is a dot product of survivors with a
	// row of the inverted matrix; result must equal the original buffer.
	for (i = 0; i < nerrs; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);

		gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, recov, temp_buff);

		if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buff, 25);
			printf("orig :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		gf_gen_cauchy1_matrix(a, m, k);

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// Make parity vects
		for (i = k; i < m; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);

			gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, buffs, buffs[i]);
		}

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
			err = 1 & rand();
			src_in_err[i] = err;
			if (err)
				src_err_list[nerrs++] = i;
		}
		if (nerrs == 0) {	// should have at least one error
			while ((err = (rand() % KMAX)) >= k) ;
			src_err_list[nerrs++] = err;
			src_in_err[err] = 1;
		}
		// construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
			printf("BAD MATRIX\n");

		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			recov[i] = buffs[r];
		}

		// Recover data
		for (i = 0; i < nerrs; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);

			gf_vect_dot_prod_base(TEST_LEN, k, g_tbls, recov, temp_buff);

			if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				// NOTE(review): this diagnostic loop reuses i, clobbering
				// the index of the failed recovery before the dumps below.
				for (i = 0; i < nerrs; i++)
					printf(" %d", src_err_list[i]);
				printf("\na:\n");
				dump_u8xu8((u8 *) a, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) d, k, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buff, 25);
				return -1;
			}
		}
		putchar('.');
	}

	printf("done all: Pass\n");
	return 0;
}
|
184
erasure_code/gf_vect_dot_prod_perf.c
Normal file
184
erasure_code/gf_vect_dot_prod_perf.c
Normal file
@ -0,0 +1,184 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump `len` bytes of `buf` to stdout, 32 bytes per row, plus a
// closing newline.
void dump(unsigned char *buf, int len)
{
	int pos = 0;

	while (pos < len) {
		printf(" %2x", buf[pos++] & 0xff);
		if (!(pos % 32))
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print m bytes from each of the k buffer pointers in `s` as a hex matrix,
// one buffer per output line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int r;

	for (r = 0; r < k; r++) {
		int c;

		for (c = 0; c < m; c++)
			printf(" %2x", s[r][c]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
|
||||
u8 *temp_buff, *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
}
|
271
erasure_code/gf_vect_dot_prod_sse.asm
Normal file
271
erasure_code/gf_vect_dot_prod_sse.asm
Normal file
@ -0,0 +1,271 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_dot_prod_sse(len, vec, *g_tbls, **buffs, *dest);
;;;
;;; GF(2^8) vector dot product: dest[i] = XOR over j of (g[j] * buffs[j][i]),
;;; processed 16 bytes at a time with SSSE3 pshufb nibble table lookups.
;;; g_tbls holds 32 bytes per source: low-nibble then high-nibble product table.
;;; Returns 0 on success, 1 when len < 16 (minimum block this kernel handles).

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8

 %define tmp   r11
 %define tmp2  r10
 %define tmp3  r9
 %define return rax
 ; SLDR/SSTR are no-ops on 64-bit: every argument stays in a register
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9

 %define arg4  r12		; must be saved and loaded
 %define tmp   r11
 %define tmp2  r10
 %define tmp3  rdi		; must be saved and loaded
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define frame_size 2*8
 %define arg(x)      [rsp + frame_size + PS + PS*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	rex_push_reg	r12
	push_reg	rdi
	end_prolog
	mov	arg4, arg(4)	; 5th arg is on the stack in the win64 ABI
 %endmacro

 %macro FUNC_RESTORE 0
	pop	rdi
	pop	r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32

;;;================== High Address;
;;;	arg4
;;;	arg3
;;;	arg2
;;;	arg1
;;;	arg0
;;;	return
;;;<================= esp of caller
;;;	ebp
;;;<================= ebp = esp
;;;	esi
;;;	edi
;;;	ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;

 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]

 ; Too few registers on i386: some "args" live in stack slots and are
 ; loaded on demand through SLDR into the shared scratch register.
 %define trans   ecx		;trans is for the variables in stack
 %define arg0    trans
 %define arg0_m  arg(0)
 %define arg1    trans
 %define arg1_m  arg(1)
 %define arg2    arg2_m		; late-expanding alias: arg2 is memory-only here
 %define arg2_m  arg(2)
 %define arg3    ebx
 %define arg4    trans
 %define arg4_m  arg(4)
 %define tmp     edx
 %define tmp2    edi
 %define tmp3    esi
 %define return  eax
 %macro SLDR 2			;; stack load/restore
	mov %1, %2
 %endmacro
 %define SSTR SLDR

 %macro FUNC_SAVE 0
	push	ebp
	mov	ebp, esp
	push	esi
	push	edi
	push	ebx
	mov	arg3, arg(3)
 %endmacro

 %macro FUNC_RESTORE 0
	pop	ebx
	pop	edi
	pop	esi
	mov	esp, ebp
	pop	ebp
 %endmacro

%endif	; output formats

%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4

%define vec_i     tmp2
%define ptr       tmp3
%define pos       return

%ifidn PS,4				;32-bit code
 %define  vec_m   arg1_m
 %define  len_m   arg0_m
 %define  dest_m  arg4_m
%endif

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

%ifidn PS,8				;64-bit code
 default rel
 [bits 64]
%endif

section .text

%define xmask0f  xmm5
%define xgft_lo  xmm4
%define xgft_hi  xmm3

%define x0     xmm0
%define xtmpa  xmm1
%define xp     xmm2

align 16
global gf_vect_dot_prod_sse:function
func(gf_vect_dot_prod_sse)
	FUNC_SAVE
	SLDR	len, len_m
	sub	len, 16			; rebase len to the last valid 16B offset
	SSTR	len_m, len
	jl	.return_fail		; original len < 16: unsupported
	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

.loop16:
	; One 16-byte block of dest: accumulate GF partials over all vec sources.
	pxor	xp, xp
	mov	tmp, mul_array
	xor	vec_i, vec_i

.next_vect:

	mov	ptr, [src+vec_i*PS]
	movdqu	xgft_lo, [tmp]		;Load array Cx{00}, Cx{01}, ..., Cx{0f}
	movdqu	xgft_hi, [tmp+16]	; "     Cx{00}, Cx{10}, ..., Cx{f0}
	XLDR	x0, [ptr+pos]		;Get next source vector

	add	tmp, 32			; 32 bytes of tables per source
	add	vec_i, 1

	movdqa	xtmpa, x0		;Keep unshifted copy of src
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f		;Mask low src nibble in bits 4-0

	pshufb	xgft_hi, x0		;Lookup mul table of high nibble
	pshufb	xgft_lo, xtmpa		;Lookup mul table of low nibble
	pxor	xgft_hi, xgft_lo	;GF add high and low partials
	pxor	xp, xgft_hi		;xp += partial

	SLDR	vec, vec_m
	cmp	vec_i, vec
	jl	.next_vect

	SLDR	dest, dest_m
	XSTR	[dest+pos], xp

	add	pos, 16			;Loop on 16 bytes at a time
	SLDR	len, len_m
	cmp	pos, len
	jle	.loop16

	; len holds (original_len - 16); if pos == original_len we covered
	; everything exactly, otherwise redo the final, possibly overlapping,
	; 16-byte block anchored at original_len - 16.
	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len
	mov	pos, len		;Overlapped offset length-16
	jmp	.loop16			;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func          core, ver, snum
slversion gf_vect_dot_prod_sse, 00, 05, 0060
|
184
erasure_code/gf_vect_dot_prod_sse_perf.c
Normal file
184
erasure_code/gf_vect_dot_prod_sse_perf.c
Normal file
@ -0,0 +1,184 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_sse
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump the first len bytes of buf to stdout, 32 bytes per output row.
void dump(unsigned char *buf, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		printf(" %2x", 0xff & buf[i]);
		if ((i + 1) % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of the first k row pointers in s,
// one row per line, in hex.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], *dest, *dest_ref;
|
||||
u8 *temp_buff, *buffs[TEST_SOURCES];
|
||||
struct perf start, stop;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_dot_prod_base" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
#endif
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++)
|
||||
gf_vect_mul_init(g[j], &g_tbls[j * 32]);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + 1) * i);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
}
|
525
erasure_code/gf_vect_dot_prod_sse_test.c
Normal file
525
erasure_code/gf_vect_dot_prod_sse_test.c
Normal file
@ -0,0 +1,525 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod_sse
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 16
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Print len bytes of buf in hex, wrapping the output every 32 bytes.
void dump(unsigned char *buf, int len)
{
	int n = 0;

	while (n < len) {
		printf(" %2x", 0xff & buf[n]);
		n++;
		if (n % 32 == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Hex-print an array of k row pointers, m bytes per row, one row per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Hex-print a flat k-by-m byte matrix stored row-major in s.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Unit test driver for FUNCTION_UNDER_TEST (GF(2^8) vector dot product).
 *
 * Phases, each against gf_vect_dot_prod_base as reference:
 *   1. all-zero data, 2. random data, 3. random data with varied source
 *   counts, 4. Reed-Solomon encode + erasure recovery (fixed then random
 *   m/k), 5. buffers aligned to end-of-allocation (Electric Fence style),
 *   6. random pointer misalignment with write-over padding checks,
 *   7. every length from TEST_LEN down to TEST_MIN_SIZE.
 * Returns 0 on pass, -1 on any failure. The sequence of rand() calls is
 * part of the test's reproducibility — do not reorder them.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs, m, k, nerrs, r, err;
	void *buf;
	u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
	u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
	u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
	u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];
	u8 *udest_ptr;

	printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays (64B-aligned; never freed — process exit reclaims)
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	dest_ref = buf;

	if (posix_memalign(&buf, 64, TEST_LEN)) {
		printf("alloc error: Fail");
		return -1;
	}
	temp_buff = buf;

	// Test of all zeros: zero data and zero coefficients must give zero out
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	memset(dest, 0, TEST_LEN);
	memset(temp_buff, 0, TEST_LEN);
	memset(dest_ref, 0, TEST_LEN);
	memset(g, 0, TEST_SOURCES);

	for (i = 0; i < TEST_SOURCES; i++)
		gf_vect_mul_init(g[i], &g_tbls[i * 32]);

	gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);

	FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

	if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
		printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
		dump_matrix(buffs, 5, TEST_SOURCES);
		printf("dprod_base:");
		dump(dest_ref, 25);
		printf("dprod:");
		dump(dest, 25);
		return -1;
	} else
		putchar('.');

	// Rand data test
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);

		if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}

		putchar('.');
	}

	// Rand data test with varied parameters (every source count down to 1)
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < srcs; i++)
				g[i] = rand();

			for (i = 0; i < srcs; i++)
				gf_vect_mul_init(g[i], &g_tbls[i * 32]);

			gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
			FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);

			if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
				dump_matrix(buffs, 5, srcs);
				printf("dprod_base:");
				dump(dest_ref, 5);
				printf("dprod:");
				dump(dest, 5);
				return -1;
			}

			putchar('.');
		}
	}

	// Test erasure code using gf_vect_dot_prod

	// Pick a first test: m total rows, k data rows
	m = 9;
	k = 5;
	if (m > MMAX || k > KMAX)
		return -1;

	gf_gen_rs_matrix(a, m, k);

	// Make random data
	for (i = 0; i < k; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Make parity vects (rows k..m-1 of the RS matrix times the data)
	for (i = k; i < m; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
	}

	// Random buffers in erasure (at most m - k of the k data rows)
	memset(src_in_err, 0, TEST_SOURCES);
	for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
		err = 1 & rand();
		src_in_err[i] = err;
		if (err)
			src_err_list[nerrs++] = i;
	}

	// construct b by removing error rows
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		for (j = 0; j < k; j++)
			b[k * i + j] = a[k * r + j];
	}

	// NOTE(review): failure only prints; test continues with d undefined
	if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
		printf("BAD MATRIX\n");

	// Gather the surviving rows for recovery
	for (i = 0, r = 0; i < k; i++, r++) {
		while (src_in_err[r]) {
			r++;
			continue;
		}
		recov[i] = buffs[r];
	}

	// Recover data: each erased row = inverse-matrix row dotted with survivors
	for (i = 0; i < nerrs; i++) {
		for (j = 0; j < k; j++)
			gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
		FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
		gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif

		if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
			printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
			printf("recov %d:", src_err_list[i]);
			dump(temp_buff, 25);
			printf("orig  :");
			dump(buffs[src_err_list[i]], 25);
			return -1;
		}
	}

	// Do more random tests (random m and k each round)

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		while ((m = (rand() % MMAX)) < 2) ;
		while ((k = (rand() % KMAX)) >= m || k < 1) ;

		if (m > MMAX || k > KMAX)
			continue;

		gf_gen_rs_matrix(a, m, k);

		// Make random data
		for (i = 0; i < k; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		// Make parity vects
		for (i = k; i < m; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
#endif
		}

		// Random errors
		memset(src_in_err, 0, TEST_SOURCES);
		for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
			err = 1 & rand();
			src_in_err[i] = err;
			if (err)
				src_err_list[nerrs++] = i;
		}
		if (nerrs == 0) {	// should have at least one error
			while ((err = (rand() % KMAX)) >= k) ;
			src_err_list[nerrs++] = err;
			src_in_err[err] = 1;
		}
		// construct b by removing error rows
		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			for (j = 0; j < k; j++)
				b[k * i + j] = a[k * r + j];
		}

		if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
			printf("BAD MATRIX\n");

		for (i = 0, r = 0; i < k; i++, r++) {
			while (src_in_err[r]) {
				r++;
				continue;
			}
			recov[i] = buffs[r];
		}

		// Recover data
		for (i = 0; i < nerrs; i++) {
			for (j = 0; j < k; j++)
				gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
#ifndef USEREF
			FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
#else
			gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
#endif
			if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
				printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
				printf(" - erase list = ");
				// NOTE(review): reuses outer i; harmless only
				// because the path returns -1 below
				for (i = 0; i < nerrs; i++)
					printf(" %d", src_err_list[i]);
				printf("\na:\n");
				dump_u8xu8((u8 *) a, m, k);
				printf("inv b:\n");
				dump_u8xu8((u8 *) d, k, k);
				printf("orig data:\n");
				dump_matrix(buffs, m, 25);
				printf("orig   :");
				dump(buffs[src_err_list[i]], 25);
				printf("recov %d:", src_err_list[i]);
				dump(temp_buff, 25);
				return -1;
			}
		}
		putchar('.');
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < TEST_SOURCES; i++)
			g[i] = rand();

		for (i = 0; i < TEST_SOURCES; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
		FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);

		if (0 != memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
			dump_matrix(efence_buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, align);
			printf("dprod:");
			dump(dest, align);
			return -1;
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// offset=1 makes the masks below span [0, PTR_ALIGN_CHK_B-1];
		// when PTR_ALIGN_CHK_B is 0 the masks collapse to 0 (aligned only)
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));

		memset(dest, 0, TEST_LEN);	// zero pad to check write-over

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);

		if (memcmp(dest_ref, udest_ptr, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
			       srcs);
			dump_matrix(ubuffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(udest_ptr, 25);
			return -1;
		}
		// Confirm that padding around dests is unchanged
		memset(dest_ref, 0, PTR_ALIGN_CHK_B);	// Make reference zero buff
		offset = udest_ptr - dest;

		if (memcmp(dest, dest_ref, offset)) {
			printf("Fail rand ualign pad start\n");
			return -1;
		}
		if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
			printf("Fail rand ualign pad end\n");
			return -1;
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		srcs = TEST_SOURCES;

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < srcs; i++)
			g[i] = rand();

		for (i = 0; i < srcs; i++)
			gf_vect_mul_init(g[i], &g_tbls[i * 32]);

		gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);

		FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);

		if (memcmp(dest_ref, dest, size)) {
			printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
			       size);
			dump_matrix(buffs, 5, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref, 25);
			printf("dprod:");
			dump(dest, 25);
			return -1;
		}
	}

	printf("done all: Pass\n");
	return 0;
}
|
525
erasure_code/gf_vect_dot_prod_test.c
Normal file
525
erasure_code/gf_vect_dot_prod_test.c
Normal file
@ -0,0 +1,525 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_vect_dot_prod
|
||||
#endif
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE 32
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#define MMAX TEST_SOURCES
|
||||
#define KMAX TEST_SOURCES
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 32
|
||||
# define LEN_ALIGN_CHK_B 32 // 0 for aligned only
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump len bytes of buf to stdout, 32 bytes per output row.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", 0xff & buf[idx++]);
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print the first m bytes of each of the k row buffers in s as hex,
// one buffer per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m byte matrix stored row-major in the flat array s.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", 0xff & s[col + (row * m)]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, rtest, srcs, m, k, nerrs, r, err;
|
||||
void *buf;
|
||||
u8 g[TEST_SOURCES], g_tbls[TEST_SOURCES * 32], src_in_err[TEST_SOURCES];
|
||||
u8 *dest, *dest_ref, *temp_buff, *buffs[TEST_SOURCES];
|
||||
u8 a[MMAX * KMAX], b[MMAX * KMAX], d[MMAX * KMAX];
|
||||
u8 src_err_list[TEST_SOURCES], *recov[TEST_SOURCES];
|
||||
|
||||
int align, size;
|
||||
unsigned char *efence_buffs[TEST_SOURCES];
|
||||
unsigned int offset;
|
||||
u8 *ubuffs[TEST_SOURCES];
|
||||
u8 *udest_ptr;
|
||||
|
||||
printf(xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref = buf;
|
||||
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
temp_buff = buf;
|
||||
|
||||
// Test of all zeros
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
memset(buffs[i], 0, TEST_LEN);
|
||||
|
||||
memset(dest, 0, TEST_LEN);
|
||||
memset(temp_buff, 0, TEST_LEN);
|
||||
memset(dest_ref, 0, TEST_LEN);
|
||||
memset(g, 0, TEST_SOURCES);
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " \n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
} else
|
||||
putchar('.');
|
||||
|
||||
// Rand data test
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[0], buffs, dest_ref);
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " 1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Rand data test with varied parameters
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[0], buffs, dest_ref);
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, srcs, g_tbls, buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, TEST_LEN)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 2\n");
|
||||
dump_matrix(buffs, 5, srcs);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 5);
|
||||
printf("dprod:");
|
||||
dump(dest, 5);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
}
|
||||
|
||||
// Test erasure code using gf_vect_dot_prod
|
||||
|
||||
// Pick a first test
|
||||
m = 9;
|
||||
k = 5;
|
||||
if (m > MMAX || k > KMAX)
|
||||
return -1;
|
||||
|
||||
gf_gen_rs_matrix(a, m, k);
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// Make parity vects
|
||||
for (i = k; i < m; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Random buffers in erasure
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err)
|
||||
src_err_list[nerrs++] = i;
|
||||
}
|
||||
|
||||
// construct b by removing error rows
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
for (j = 0; j < k; j++)
|
||||
b[k * i + j] = a[k * r + j];
|
||||
}
|
||||
|
||||
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
|
||||
printf("BAD MATRIX\n");
|
||||
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
recov[i] = buffs[r];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
|
||||
#endif
|
||||
|
||||
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d)\n", m, k, nerrs);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buff, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Do more random tests
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
while ((m = (rand() % MMAX)) < 2) ;
|
||||
while ((k = (rand() % KMAX)) >= m || k < 1) ;
|
||||
|
||||
if (m > MMAX || k > KMAX)
|
||||
continue;
|
||||
|
||||
gf_gen_rs_matrix(a, m, k);
|
||||
|
||||
// Make random data
|
||||
for (i = 0; i < k; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
// Make parity vects
|
||||
for (i = k; i < m; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(a[k * i + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, buffs, buffs[i]);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], buffs, buffs[i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Random errors
|
||||
memset(src_in_err, 0, TEST_SOURCES);
|
||||
for (i = 0, nerrs = 0; i < k && nerrs < m - k; i++) {
|
||||
err = 1 & rand();
|
||||
src_in_err[i] = err;
|
||||
if (err)
|
||||
src_err_list[nerrs++] = i;
|
||||
}
|
||||
if (nerrs == 0) { // should have at least one error
|
||||
while ((err = (rand() % KMAX)) >= k) ;
|
||||
src_err_list[nerrs++] = err;
|
||||
src_in_err[err] = 1;
|
||||
}
|
||||
// construct b by removing error rows
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
for (j = 0; j < k; j++)
|
||||
b[k * i + j] = a[k * r + j];
|
||||
}
|
||||
|
||||
if (gf_invert_matrix((u8 *) b, (u8 *) d, k) < 0)
|
||||
printf("BAD MATRIX\n");
|
||||
|
||||
for (i = 0, r = 0; i < k; i++, r++) {
|
||||
while (src_in_err[r]) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
recov[i] = buffs[r];
|
||||
}
|
||||
|
||||
// Recover data
|
||||
for (i = 0; i < nerrs; i++) {
|
||||
for (j = 0; j < k; j++)
|
||||
gf_vect_mul_init(d[k * src_err_list[i] + j], &g_tbls[j * 32]);
|
||||
#ifndef USEREF
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, k, g_tbls, recov, temp_buff);
|
||||
#else
|
||||
gf_vect_dot_prod_base(TEST_LEN, k, &g_tbls[0], recov, temp_buff);
|
||||
#endif
|
||||
if (0 != memcmp(temp_buff, buffs[src_err_list[i]], TEST_LEN)) {
|
||||
printf("Fail error recovery (%d, %d, %d) - ", m, k, nerrs);
|
||||
printf(" - erase list = ");
|
||||
for (i = 0; i < nerrs; i++)
|
||||
printf(" %d", src_err_list[i]);
|
||||
printf("\na:\n");
|
||||
dump_u8xu8((u8 *) a, m, k);
|
||||
printf("inv b:\n");
|
||||
dump_u8xu8((u8 *) d, k, k);
|
||||
printf("orig data:\n");
|
||||
dump_matrix(buffs, m, 25);
|
||||
printf("orig :");
|
||||
dump(buffs[src_err_list[i]], 25);
|
||||
printf("recov %d:", src_err_list[i]);
|
||||
dump(temp_buff, 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) // Line up TEST_SIZE from end
|
||||
efence_buffs[i] = buffs[i] + TEST_LEN - size;
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(size, TEST_SOURCES, &g_tbls[0], efence_buffs, dest_ref);
|
||||
FUNCTION_UNDER_TEST(size, TEST_SOURCES, g_tbls, efence_buffs, dest);
|
||||
|
||||
if (0 != memcmp(dest_ref, dest, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test 3\n");
|
||||
dump_matrix(efence_buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, align);
|
||||
printf("dprod:");
|
||||
dump(dest, align);
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test rand ptr alignment if available
|
||||
|
||||
for (rtest = 0; rtest < RANDOMS; rtest++) {
|
||||
size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
|
||||
srcs = rand() % TEST_SOURCES;
|
||||
if (srcs == 0)
|
||||
continue;
|
||||
|
||||
offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
|
||||
// Add random offsets
|
||||
for (i = 0; i < srcs; i++)
|
||||
ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
udest_ptr = dest + (rand() & (PTR_ALIGN_CHK_B - offset));
|
||||
|
||||
memset(dest, 0, TEST_LEN); // zero pad to check write-over
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
ubuffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], ubuffs, dest_ref);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, ubuffs, udest_ptr);
|
||||
|
||||
if (memcmp(dest_ref, udest_ptr, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign srcs=%d\n",
|
||||
srcs);
|
||||
dump_matrix(ubuffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(udest_ptr, 25);
|
||||
return -1;
|
||||
}
|
||||
// Confirm that padding around dests is unchanged
|
||||
memset(dest_ref, 0, PTR_ALIGN_CHK_B); // Make reference zero buff
|
||||
offset = udest_ptr - dest;
|
||||
|
||||
if (memcmp(dest, dest_ref, offset)) {
|
||||
printf("Fail rand ualign pad start\n");
|
||||
return -1;
|
||||
}
|
||||
if (memcmp(dest + offset + size, dest_ref, PTR_ALIGN_CHK_B - offset)) {
|
||||
printf("Fail rand ualign pad end\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Test all size alignment
|
||||
align = (LEN_ALIGN_CHK_B != 0) ? 1 : 16;
|
||||
|
||||
for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
|
||||
srcs = TEST_SOURCES;
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
for (j = 0; j < size; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
g[i] = rand();
|
||||
|
||||
for (i = 0; i < srcs; i++)
|
||||
gf_vect_mul_init(g[i], &g_tbls[i * 32]);
|
||||
|
||||
gf_vect_dot_prod_base(size, srcs, &g_tbls[0], buffs, dest_ref);
|
||||
|
||||
FUNCTION_UNDER_TEST(size, srcs, g_tbls, buffs, dest);
|
||||
|
||||
if (memcmp(dest_ref, dest, size)) {
|
||||
printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " ualign len=%d\n",
|
||||
size);
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref, 25);
|
||||
printf("dprod:");
|
||||
dump(dest, 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("done all: Pass\n");
|
||||
return 0;
|
||||
}
|
196
erasure_code/gf_vect_mad_avx.asm
Normal file
196
erasure_code/gf_vect_mad_avx.asm
Normal file
@ -0,0 +1,196 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
;;;
;;; GF(2^8) multiply-accumulate: dest[i] ^= C * src[i] for len bytes,
;;; where C is selected from mul_array by vec_i. AVX (128-bit) version,
;;; processing 16 bytes per iteration via two vpshufb nibble lookups.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, win64
 ; Windows x64 ABI: args in rcx, rdx, r8, r9; 5th/6th args on the stack,
 ; loaded into callee-saved r12/r15 by FUNC_SAVE below.
 %define arg0  rcx
 %define arg0.w ecx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define arg4  r12
 %define arg5  r15
 %define tmp    r11
 %define return rax
 %define return.w eax
 %define PS 8
 %define stack_size 16*3 + 3*8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save xmm6-8 (non-volatile on win64) and the two GP regs we borrow,
 ; then pull the stack-passed args into registers.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp+16*0],xmm6
	vmovdqa	[rsp+16*1],xmm7
	vmovdqa	[rsp+16*2],xmm8
	save_reg	r12,  3*16 + 0*8
	save_reg	r15,  3*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp+16*0]
	vmovdqa	xmm7, [rsp+16*1]
	vmovdqa	xmm8, [rsp+16*2]
	mov	r12,  [rsp + 3*16 + 0*8]
	mov	r15,  [rsp + 3*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ; System V AMD64 ABI: all six args arrive in registers; nothing to save.
 %define arg0  rdi
 %define arg0.w edi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp    r11
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec    arg1
%define vec_i    arg2
%define mul_array arg3
%define	src   arg4
%define dest  arg5
%define pos   return
%define pos.w return.w

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

%define xmask0f  xmm8
%define xgft_lo  xmm7
%define xgft_hi  xmm6

%define x0     xmm0
%define xtmpa  xmm1
%define xtmph  xmm2
%define xtmpl  xmm3
%define xd     xmm4
%define xtmpd  xmm5

align 16
global gf_vect_mad_avx:function
func(gf_vect_mad_avx)
	FUNC_SAVE
	sub	len, 16			; fail if len < 16 (minimum vector width)
	jl	.return_fail

	xor	pos, pos
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte

	sal	vec_i, 5		;Multiply by 32
	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	vmovdqu	xgft_hi, [vec_i+mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

	XLDR	xtmpd, [dest+len]	;backup the last 16 bytes in dest

.loop16:
	XLDR	xd, [dest+pos]		;Get next dest vector
.loop16_overlap:
	XLDR	x0, [src+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xtmph, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
	vpxor	xd, xd, xtmph		;xd += partial

	XSTR	[dest+pos], xd
	add	pos, 16			;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass		; len was a multiple of 16 - done

	;; Tail len
	;; Non-multiple-of-16 tail: redo the last (overlapping) 16 bytes.
	;; xtmpd holds the pre-call contents of that region so the already
	;; accumulated overlap is not applied twice.
	mov	pos, len		;Overlapped offset length-16
	vmovdqa	xd, xtmpd		;Restore xd
	jmp	.loop16_overlap		;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f:	ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_vect_mad_avx, 02,  01,  0201
|
203
erasure_code/gf_vect_mad_avx2.asm
Normal file
203
erasure_code/gf_vect_mad_avx2.asm
Normal file
@ -0,0 +1,203 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
;;;
;;; GF(2^8) multiply-accumulate: dest[i] ^= C * src[i] for len bytes.
;;; AVX2 (256-bit) version, 32 bytes per iteration; the 16-entry nibble
;;; tables are broadcast into both ymm lanes so vpshufb works per-lane.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg0.w ecx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12 		; must be saved and loaded
 %define arg5   r15

 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax
 %define PS 8
 %define stack_size 16*3 + 3*8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 ; Save xmm6-8 (non-volatile on win64) and r12/r15, then load the two
 ; stack-passed args.
 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	vmovdqa	[rsp+16*0],xmm6
	vmovdqa	[rsp+16*1],xmm7
	vmovdqa	[rsp+16*2],xmm8
	save_reg	r12,  3*16 + 0*8
	save_reg	r15,  3*16 + 1*8
	end_prolog
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp+16*0]
	vmovdqa	xmm7, [rsp+16*1]
	vmovdqa	xmm8, [rsp+16*2]
	mov	r12,  [rsp + 3*16 + 0*8]
	mov	r15,  [rsp + 3*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ; System V AMD64 ABI: all six args in registers; no prologue needed.
 %define arg0   rdi
 %define arg0.w edi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp.w  r11d
 %define tmp.b  r11b
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif


;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
%define len   arg0
%define len.w arg0.w
%define vec    arg1
%define vec_i    arg2
%define mul_array arg3
%define	src   arg4
%define dest  arg5
%define pos   return
%define pos.w return.w

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif


default rel

[bits 64]
section .text

%define xmask0f   ymm8
%define xmask0fx  xmm8
%define xgft_lo   ymm7
%define xgft_hi   ymm6

%define x0     ymm0
%define xtmpa  ymm1
%define xtmph  ymm2
%define xtmpl  ymm3
%define xd     ymm4
%define xtmpd  ymm5

align 16
global gf_vect_mad_avx2:function
func(gf_vect_mad_avx2)
	FUNC_SAVE
	sub	len, 32			; fail if len < 32 (minimum vector width)
	jl	.return_fail
	xor	pos, pos
	mov	tmp.b, 0x0f
	vpinsrb	xmask0fx, xmask0fx, tmp.w, 0
	vpbroadcastb xmask0f, xmask0fx	;Construct mask 0x0f0f0f...

	sal	vec_i, 5		;Multiply by 32
	vmovdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
					; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
	; Duplicate the 16-byte hi/lo tables into both 128-bit lanes, since
	; vpshufb indexes within each lane independently.
	vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
	vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo

	XLDR	xtmpd, [dest+len]	;backup the last 32 bytes in dest

.loop32:
	XLDR	xd, [dest+pos]		;Get next dest vector
.loop32_overlap:
	XLDR	x0, [src+pos]		;Get next source vector

	vpand	xtmpa, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0

	vpshufb	xtmph, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmpl, xgft_lo, xtmpa	;Lookup mul table of low nibble
	vpxor	xtmph, xtmph, xtmpl	;GF add high and low partials
	vpxor	xd, xd, xtmph		;xd += partial

	XSTR	[dest+pos], xd
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jle	.loop32

	lea	tmp, [len + 32]
	cmp	pos, tmp
	je	.return_pass		; len was a multiple of 32 - done

	;; Tail len
	;; Non-multiple-of-32 tail: redo the last (overlapping) 32 bytes.
	;; xtmpd holds the pre-call contents of that region so the already
	;; accumulated overlap is not applied twice.
	mov	pos, len		;Overlapped offset length-32
	vmovdqa	xd, xtmpd		;Restore xd
	jmp	.loop32_overlap		;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

;;;       func         core, ver, snum
slversion gf_vect_mad_avx2, 04,  01,  0202
|
374
erasure_code/gf_vect_mad_perf.c
Normal file
374
erasure_code/gf_vect_mad_perf.c
Normal file
@ -0,0 +1,374 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
#include "test.h"
|
||||
|
||||
//By default, test sse version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_4vect_mad_sse
|
||||
# define REF_FUNCTION gf_4vect_dot_prod_sse
|
||||
# define VECT 4
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_SOURCES 10
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 40000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1))
|
||||
# define TEST_LOOPS 100
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
void dump(unsigned char *buf, int len)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < len;) {
|
||||
printf(" %2x", 0xff & buf[i++]);
|
||||
if (i % 32 == 0)
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void dump_matrix(unsigned char **s, int k, int m)
|
||||
{
|
||||
int i, j;
|
||||
for (i = 0; i < k; i++) {
|
||||
for (j = 0; j < m; j++) {
|
||||
printf(" %2x", s[i][j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j, l;
|
||||
void *buf;
|
||||
u8 gf[6][TEST_SOURCES];
|
||||
u8 *g_tbls;
|
||||
u8 *dest_ref[VECT];
|
||||
u8 *dest_ptrs[VECT], *buffs[TEST_SOURCES];
|
||||
u8 *dest_perf_ptrs[VECT];
|
||||
struct perf start, stop;
|
||||
|
||||
printf("test " xstr(FUNCTION_UNDER_TEST) ": %dx%d\n", TEST_SOURCES, TEST_LEN);
|
||||
|
||||
// Allocate the arrays
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
buffs[i] = buf;
|
||||
}
|
||||
|
||||
if (posix_memalign(&buf, 16, VECT * TEST_SOURCES * 32)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
g_tbls = buf;
|
||||
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ptrs[i] = buf;
|
||||
memset(dest_ptrs[i], 0, TEST_LEN);
|
||||
}
|
||||
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_ref[i] = buf;
|
||||
memset(dest_ref[i], 0, TEST_LEN);
|
||||
}
|
||||
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (posix_memalign(&buf, 64, TEST_LEN)) {
|
||||
printf("alloc error: Fail");
|
||||
return -1;
|
||||
}
|
||||
dest_perf_ptrs[i] = buf;
|
||||
memset(dest_perf_ptrs[i], 0, TEST_LEN);
|
||||
}
|
||||
|
||||
// Performance test
|
||||
for (i = 0; i < TEST_SOURCES; i++)
|
||||
for (j = 0; j < TEST_LEN; j++)
|
||||
buffs[i][j] = rand();
|
||||
|
||||
for (i = 0; i < VECT; i++)
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
gf[i][j] = rand();
|
||||
gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
|
||||
}
|
||||
|
||||
for (i = 0; i < VECT; i++)
|
||||
gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[i * 32 * TEST_SOURCES],
|
||||
buffs, dest_ref[i]);
|
||||
|
||||
for (i = 0; i < VECT; i++)
|
||||
memset(dest_ptrs[i], 0, TEST_LEN);
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], *dest_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], dest_ptrs);
|
||||
#endif
|
||||
}
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref[i], 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
#if (VECT == 1)
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
|
||||
#else
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
|
||||
#endif
|
||||
for (i = 0; i < VECT; i++) {
|
||||
if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
|
||||
printf("Fail perf " xstr(FUNCTION_UNDER_TEST) " test1\n");
|
||||
dump_matrix(buffs, 5, TEST_SOURCES);
|
||||
printf("dprod_base:");
|
||||
dump(dest_ref[i], 25);
|
||||
printf("dprod_dut:");
|
||||
dump(dest_ptrs[i], 25);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DO_REF_PERF
|
||||
|
||||
#if (VECT == 1)
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
|
||||
#else
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
|
||||
#endif
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
#if (VECT == 1)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
#elif (VECT == 2)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 3)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 4)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 5)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 6)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if (VECT == 1)
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
|
||||
#else
|
||||
REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
|
||||
#endif
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(REF_FUNCTION) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + VECT) * TEST_LOOPS);
|
||||
|
||||
#endif
|
||||
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
}
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
#if (VECT == 1)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
#elif (VECT == 2)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 3)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 4)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 5)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 6)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
#endif
|
||||
}
|
||||
for (i = 0; i < TEST_SOURCES; i++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (TEST_SOURCES + VECT) * TEST_LOOPS);
|
||||
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
for (j = 0; j < TEST_SOURCES; j++) {
|
||||
#if (VECT == 1)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
#elif (VECT == 2)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 3)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 4)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 5)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
#elif (VECT == 6)
|
||||
gf_vect_mul_init(gf[0][j], &g_tbls[j * 32]);
|
||||
gf_vect_mul_init(gf[1][j], &g_tbls[(32 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[2][j], &g_tbls[(64 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[3][j], &g_tbls[(96 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[4][j], &g_tbls[(128 * TEST_SOURCES) + (j * 32)]);
|
||||
gf_vect_mul_init(gf[5][j], &g_tbls[(160 * TEST_SOURCES) + (j * 32)]);
|
||||
#endif
|
||||
}
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) "_single_src" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (1 + VECT) * TEST_LOOPS);
|
||||
|
||||
perf_start(&start);
|
||||
for (l = 0; l < TEST_LOOPS; l++) {
|
||||
#if (VECT == 1)
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
*dest_perf_ptrs);
|
||||
#else
|
||||
FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, 0, g_tbls, buffs[0],
|
||||
dest_perf_ptrs);
|
||||
#endif
|
||||
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf(xstr(FUNCTION_UNDER_TEST) "_single_src_simple" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * (1 + VECT) * TEST_LOOPS);
|
||||
|
||||
printf("pass perf check\n");
|
||||
return 0;
|
||||
|
||||
}
|
197
erasure_code/gf_vect_mad_sse.asm
Normal file
197
erasure_code/gf_vect_mad_sse.asm
Normal file
@ -0,0 +1,197 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;;;
;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
;;;
;;; Multiply-accumulate: dest ^= GF-mul(src) using the 32-byte table set
;;; for source vec_i in mul_array.  Returns 0 on pass, 1 if len < 16.

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, win64
 ; Windows x64 ABI: args 4 and 5 arrive on the stack; xmm6-8, r12, r15
 ; are callee-saved and must be preserved across the call.
 %define arg0 rcx
 %define arg0.w ecx
 %define arg1 rdx
 %define arg2 r8
 %define arg3 r9
 %define arg4 r12
 %define arg5 r15
 %define tmp r11
 %define return rax
 %define return.w eax
 %define PS 8
 %define stack_size 16*3 + 3*8	; 3 xmm saves + 2 gp saves (+ pad)
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 %macro FUNC_SAVE 0
	sub	rsp, stack_size
	movdqa	[rsp+16*0],xmm6
	movdqa	[rsp+16*1],xmm7
	movdqa	[rsp+16*2],xmm8
	save_reg	r12, 3*16 + 0*8
	save_reg	r15, 3*16 + 1*8
	end_prolog
	; Fetch stack-passed args into the callee-saved registers
	mov	arg4, arg(4)
	mov	arg5, arg(5)
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp+16*0]
	movdqa	xmm7, [rsp+16*1]
	movdqa	xmm8, [rsp+16*2]
	mov	r12, [rsp + 3*16 + 0*8]
	mov	r15, [rsp + 3*16 + 1*8]
	add	rsp, stack_size
 %endmacro

%elifidn __OUTPUT_FORMAT__, elf64
 ; System V AMD64 ABI: all six args in registers, nothing to save.
 %define arg0 rdi
 %define arg0.w edi
 %define arg1 rsi
 %define arg2 rdx
 %define arg3 rcx
 %define arg4 r8
 %define arg5 r9
 %define tmp r11
 %define return rax
 %define return.w eax

 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
%define len arg0
%define len.w arg0.w
%define vec arg1
%define vec_i arg2
%define mul_array arg3
%define src arg4
%define dest arg5
%define pos return
%define pos.w return.w

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR movdqu
 %define XSTR movdqu
%else
;;; Use Non-temporal load/stor
 %ifdef NO_NT_LDST
  %define XLDR movdqa
  %define XSTR movdqa
 %else
  %define XLDR movntdqa
  %define XSTR movntdq
 %endif
%endif

default rel

[bits 64]
section .text

; Callee-saved xmm regs hold the per-call constants (win64)
%define xmask0f xmm8
%define xgft_lo xmm7
%define xgft_hi xmm6

%define x0 xmm0
%define xtmpa xmm1
%define xtmph xmm2
%define xtmpl xmm3
%define xd xmm4
%define xtmpd xmm5


align 16
global gf_vect_mad_sse:function
func(gf_vect_mad_sse)
	FUNC_SAVE
	sub	len, 16
	jl	.return_fail	; need at least one 16-byte block

	xor	pos, pos
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	sal	vec_i, 5	;Multiply by 32
	movdqu	xgft_lo, [vec_i+mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xgft_hi, [vec_i+mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

	XLDR	xtmpd, [dest+len]	;backup the last 16 bytes in dest

.loop16:
	XLDR	xd, [dest+pos]	;Get next dest vector
.loop16_overlap:
	XLDR	x0, [src+pos]	;Get next source vector
	movdqa	xtmph, xgft_hi	;Reload const array registers
	movdqa	xtmpl, xgft_lo
	movdqa	xtmpa, x0	;Keep unshifted copy of src
	psraw	x0, 4	;Shift to put high nibble into bits 4-0
	pand	x0, xmask0f	;Mask high src nibble in bits 4-0
	pand	xtmpa, xmask0f	;Mask low src nibble in bits 4-0
	pshufb	xtmph, x0	;Lookup mul table of high nibble
	pshufb	xtmpl, xtmpa	;Lookup mul table of low nibble
	pxor	xtmph, xtmpl	;GF add high and low partials

	pxor	xd, xtmph	;Accumulate into dest
	XSTR	[dest+pos], xd	;Store result

	add	pos, 16	;Loop on 16 bytes at a time
	cmp	pos, len
	jle	.loop16

	; If pos == len+16 the length was an exact multiple of 16 -- done.
	lea	tmp, [len + 16]
	cmp	pos, tmp
	je	.return_pass

	;; Tail len: rerun the final (overlapping) 16-byte block at
	;; offset len-16, using xtmpd (saved before the loop) as the
	;; dest value so already-updated bytes are not accumulated twice.
	mov	pos, len	;Overlapped offset length-16
	movdqa	xd, xtmpd	;Restore xd
	jmp	.loop16_overlap	;Do one more overlap pass

.return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

.return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16

mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;; func core, ver, snum
slversion gf_vect_mad_sse, 00, 01, 0200
|
508
erasure_code/gf_vect_mad_test.c
Normal file
508
erasure_code/gf_vect_mad_test.c
Normal file
@ -0,0 +1,508 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset, memcmp
|
||||
#include "erasure_code.h"
|
||||
#include "types.h"
|
||||
|
||||
#ifndef ALIGN_SIZE
|
||||
# define ALIGN_SIZE 16
|
||||
#endif
|
||||
|
||||
//By default, test sse version
|
||||
#ifndef FUNCTION_UNDER_TEST
|
||||
# define FUNCTION_UNDER_TEST gf_6vect_mad_sse
|
||||
# define REF_FUNCTION gf_6vect_dot_prod_sse
|
||||
# define VECT 6
|
||||
#endif
|
||||
|
||||
#ifndef TEST_MIN_SIZE
|
||||
# define TEST_MIN_SIZE ALIGN_SIZE
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
#define TEST_LEN 8192
|
||||
#define TEST_SIZE (TEST_LEN/2)
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 20000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
#ifndef TEST_SOURCES
|
||||
# define TEST_SOURCES 16
|
||||
#endif
|
||||
#ifndef RANDOMS
|
||||
# define RANDOMS 20
|
||||
#endif
|
||||
|
||||
#ifdef EC_ALIGNED_ADDR
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B 0
|
||||
# define LEN_ALIGN_CHK_B 0 // 0 for aligned only
|
||||
#else
|
||||
// Define power of 2 range to check ptr, len alignment
|
||||
# define PTR_ALIGN_CHK_B ALIGN_SIZE
|
||||
# define LEN_ALIGN_CHK_B ALIGN_SIZE // 0 for aligned only
|
||||
#endif
|
||||
|
||||
#define str(s) #s
|
||||
#define xstr(s) str(s)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
// Hex-dump len bytes of buf, 32 bytes per line, with a final newline.
void dump(unsigned char *buf, int len)
{
	int idx = 0;

	while (idx < len) {
		printf(" %2x", buf[idx] & 0xff);
		idx++;
		if ((idx % 32) == 0)
			printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print k buffers of m bytes each (array-of-pointers layout) in hex,
// one buffer per line.
void dump_matrix(unsigned char **s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[row][col]);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
// Print a k x m byte matrix stored contiguously (row-major) in hex,
// one row per line.
void dump_u8xu8(unsigned char *s, int k, int m)
{
	int row, col;

	for (row = 0; row < k; row++) {
		for (col = 0; col < m; col++)
			printf(" %2x", s[(row * m) + col] & 0xff);
		printf("\n");
	}
	printf("\n");
}
|
||||
|
||||
/*
 * Functional test for FUNCTION_UNDER_TEST (a gf_Nvect_mad kernel),
 * checked against gf_vect_dot_prod_base and REF_FUNCTION across:
 * fixed coefficients, random data, varied source counts, end-of-buffer
 * (Electric Fence style) placement, unaligned pointers, and all length
 * alignments.  Returns 0 on pass, -1 on any mismatch or alloc failure.
 */
int main(int argc, char *argv[])
{
	int i, j, rtest, srcs;
	void *buf;
	u8 gf[6][TEST_SOURCES];	// GF coefficients, one row per output vector
	u8 *g_tbls;		// expanded multiply tables
	u8 *dest_ref[VECT];	// reference outputs
	u8 *dest_ptrs[VECT], *buffs[TEST_SOURCES];
	int vector = VECT;

	int align, size;
	unsigned char *efence_buffs[TEST_SOURCES];	// sources shifted to buffer end
	unsigned int offset;
	u8 *ubuffs[TEST_SOURCES];	// deliberately misaligned sources
	u8 *udest_ptrs[VECT];	// deliberately misaligned dests
	printf("test" xstr(FUNCTION_UNDER_TEST) ": %dx%d ", TEST_SOURCES, TEST_LEN);

	// Allocate the arrays
	for (i = 0; i < TEST_SOURCES; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		buffs[i] = buf;
	}

	// 2x the table space of vector*TEST_SOURCES*32 bytes
	// NOTE(review): the 2x factor's purpose is not visible here -- the
	// indexing below only uses the first half; presumably headroom.
	if (posix_memalign(&buf, 16, 2 * (vector * TEST_SOURCES * 32))) {
		printf("alloc error: Fail");
		return -1;
	}
	g_tbls = buf;

	for (i = 0; i < vector; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ptrs[i] = buf;
		memset(dest_ptrs[i], 0, TEST_LEN);
	}

	for (i = 0; i < vector; i++) {
		if (posix_memalign(&buf, 64, TEST_LEN)) {
			printf("alloc error: Fail");
			return -1;
		}
		dest_ref[i] = buf;
		memset(dest_ref[i], 0, TEST_LEN);
	}

	// Test of all zeros
	// NOTE(review): the zeroed buffers and fixed coefficients set up
	// here are overwritten with rand() data below before the first
	// kernel call, so the "zero" check actually runs on random data.
	for (i = 0; i < TEST_SOURCES; i++)
		memset(buffs[i], 0, TEST_LEN);

	// Fixed per-vector coefficients; cases intentionally fall through
	// so every row 0..vector-1 is initialized.
	switch (vector) {
	case 6:
		memset(gf[5], 0xe6, TEST_SOURCES);
	case 5:
		memset(gf[4], 4, TEST_SOURCES);
	case 4:
		memset(gf[3], 9, TEST_SOURCES);
	case 3:
		memset(gf[2], 7, TEST_SOURCES);
	case 2:
		memset(gf[1], 1, TEST_SOURCES);
	case 1:
		memset(gf[0], 2, TEST_SOURCES);
		break;
	default:
		return -1;
	}

	for (i = 0; i < TEST_SOURCES; i++)
		for (j = 0; j < TEST_LEN; j++)
			buffs[i][j] = rand();

	// Random coefficients; table set i for source j lives at
	// g_tbls[i*32*TEST_SOURCES + j*32]
	for (i = 0; i < vector; i++)
		for (j = 0; j < TEST_SOURCES; j++) {
			gf[i][j] = rand();
			gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
		}

	for (i = 0; i < vector; i++)
		gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES, &g_tbls[i * 32 * TEST_SOURCES],
				      buffs, dest_ref[i]);

	// Accumulating every source via the mad kernel into zeroed dests
	// must equal the dot-product reference.
	for (i = 0; i < vector; i++)
		memset(dest_ptrs[i], 0, TEST_LEN);
	for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
		// 1-vect kernels take a plain u8* destination
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], *dest_ptrs);
#else
		FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i], dest_ptrs);
#endif
	}
	for (i = 0; i < vector; i++) {
		if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
			printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
			dump_matrix(buffs, vector, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref[i], 25);
			printf("dprod_dut:");
			dump(dest_ptrs[i], 25);
			return -1;
		}
	}

	// Cross-check against the optimized dot-product as well
#if (VECT == 1)
	REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, *dest_ref);
#else
	REF_FUNCTION(TEST_LEN, TEST_SOURCES, g_tbls, buffs, dest_ref);
#endif
	for (i = 0; i < vector; i++) {
		if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
			printf("Fail zero " xstr(FUNCTION_UNDER_TEST) " test%d\n", i);
			dump_matrix(buffs, vector, TEST_SOURCES);
			printf("dprod_base:");
			dump(dest_ref[i], 25);
			printf("dprod_dut:");
			dump(dest_ptrs[i], 25);
			return -1;
		}
	}

	putchar('.');

	// Rand data test

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < vector; i++)
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(TEST_LEN, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], buffs,
					      dest_ref[i]);

		for (i = 0; i < vector; i++)
			memset(dest_ptrs[i], 0, TEST_LEN);
		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(TEST_LEN, TEST_SOURCES, i, g_tbls, buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST) " test%d %d\n",
				       i, rtest);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[i], 25);
				return -1;
			}
		}

		putchar('.');
	}

	// Rand data test with varied parameters: every source count from
	// TEST_SOURCES down to 1 (table stride depends on srcs).
	for (rtest = 0; rtest < RANDOMS; rtest++) {
		for (srcs = TEST_SOURCES; srcs > 0; srcs--) {
			for (i = 0; i < srcs; i++)
				for (j = 0; j < TEST_LEN; j++)
					buffs[i][j] = rand();

			for (i = 0; i < vector; i++)
				for (j = 0; j < srcs; j++) {
					gf[i][j] = rand();
					gf_vect_mul_init(gf[i][j],
							 &g_tbls[i * (32 * srcs) + j * 32]);
				}

			for (i = 0; i < vector; i++)
				gf_vect_dot_prod_base(TEST_LEN, srcs, &g_tbls[i * 32 * srcs],
						      buffs, dest_ref[i]);

			for (i = 0; i < vector; i++)
				memset(dest_ptrs[i], 0, TEST_LEN);
			for (i = 0; i < srcs; i++) {
#if (VECT == 1)
				FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
						    *dest_ptrs);
#else
				FUNCTION_UNDER_TEST(TEST_LEN, srcs, i, g_tbls, buffs[i],
						    dest_ptrs);
#endif

			}
			for (i = 0; i < vector; i++) {
				if (0 != memcmp(dest_ref[i], dest_ptrs[i], TEST_LEN)) {
					printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
					       " test%d srcs=%d\n", i, srcs);
					dump_matrix(buffs, vector, TEST_SOURCES);
					printf("dprod_base:");
					dump(dest_ref[i], 25);
					printf("dprod_dut:");
					dump(dest_ptrs[i], 25);
					return -1;
				}
			}

			putchar('.');
		}
	}

	// Run tests at end of buffer for Electric Fence
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;
	for (size = TEST_MIN_SIZE; size <= TEST_SIZE; size += align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < TEST_LEN; j++)
				buffs[i][j] = rand();

		for (i = 0; i < TEST_SOURCES; i++)	// Line up TEST_SIZE from end
			efence_buffs[i] = buffs[i] + TEST_LEN - size;

		for (i = 0; i < vector; i++)
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], efence_buffs,
					      dest_ref[i]);

		for (i = 0; i < vector; i++)
			memset(dest_ptrs[i], 0, size);
		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, efence_buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d size=%d\n", i, size);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], TEST_MIN_SIZE + align);
				printf("dprod_dut:");
				dump(dest_ptrs[i], TEST_MIN_SIZE + align);
				return -1;
			}
		}

		putchar('.');
	}

	// Test rand ptr alignment if available

	for (rtest = 0; rtest < RANDOMS; rtest++) {
		size = (TEST_LEN - PTR_ALIGN_CHK_B) & ~(TEST_MIN_SIZE - 1);
		srcs = rand() % TEST_SOURCES;
		if (srcs == 0)
			continue;

		// offset=1 when alignment checking is enabled, else 0
		offset = (PTR_ALIGN_CHK_B != 0) ? 1 : PTR_ALIGN_CHK_B;
		// Add random offsets
		for (i = 0; i < srcs; i++)
			ubuffs[i] = buffs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));

		for (i = 0; i < vector; i++) {
			udest_ptrs[i] = dest_ptrs[i] + (rand() & (PTR_ALIGN_CHK_B - offset));
			memset(dest_ptrs[i], 0, TEST_LEN);	// zero pad to check write-over
		}

		for (i = 0; i < srcs; i++)
			for (j = 0; j < size; j++)
				ubuffs[i][j] = rand();

		for (i = 0; i < vector; i++)
			for (j = 0; j < srcs; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j], &g_tbls[i * (32 * srcs) + j * 32]);
			}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, srcs, &g_tbls[i * 32 * srcs], ubuffs,
					      dest_ref[i]);

		for (i = 0; i < srcs; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], *udest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, srcs, i, g_tbls, ubuffs[i], udest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], udest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d ualign srcs=%d\n", i, srcs);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(udest_ptrs[i], 25);
				return -1;
			}
		}

		// Confirm that padding around dests is unchanged
		memset(dest_ref[0], 0, PTR_ALIGN_CHK_B);	// Make reference zero buff

		for (i = 0; i < vector; i++) {
			// Bytes before udest_ptrs[i] and after the written
			// region must still be the zero padding.
			offset = udest_ptrs[i] - dest_ptrs[i];
			if (memcmp(dest_ptrs[i], dest_ref[0], offset)) {
				printf("Fail rand ualign pad1 start\n");
				return -1;
			}
			if (memcmp
			    (dest_ptrs[i] + offset + size, dest_ref[0],
			     PTR_ALIGN_CHK_B - offset)) {
				printf("Fail rand ualign pad1 end\n");
				return -1;
			}
		}

		putchar('.');
	}

	// Test all size alignment
	align = (LEN_ALIGN_CHK_B != 0) ? 1 : ALIGN_SIZE;

	for (size = TEST_LEN; size >= TEST_MIN_SIZE; size -= align) {
		for (i = 0; i < TEST_SOURCES; i++)
			for (j = 0; j < size; j++)
				buffs[i][j] = rand();

		for (i = 0; i < vector; i++) {
			for (j = 0; j < TEST_SOURCES; j++) {
				gf[i][j] = rand();
				gf_vect_mul_init(gf[i][j],
						 &g_tbls[i * (32 * TEST_SOURCES) + j * 32]);
			}
			memset(dest_ptrs[i], 0, TEST_LEN);	// zero pad to check write-over
		}

		for (i = 0; i < vector; i++)
			gf_vect_dot_prod_base(size, TEST_SOURCES,
					      &g_tbls[i * 32 * TEST_SOURCES], buffs,
					      dest_ref[i]);

		for (i = 0; i < TEST_SOURCES; i++) {
#if (VECT == 1)
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
					    *dest_ptrs);
#else
			FUNCTION_UNDER_TEST(size, TEST_SOURCES, i, g_tbls, buffs[i],
					    dest_ptrs);
#endif
		}
		for (i = 0; i < vector; i++) {
			if (0 != memcmp(dest_ref[i], dest_ptrs[i], size)) {
				printf("Fail rand " xstr(FUNCTION_UNDER_TEST)
				       " test%d ualign len=%d\n", i, size);
				dump_matrix(buffs, vector, TEST_SOURCES);
				printf("dprod_base:");
				dump(dest_ref[i], 25);
				printf("dprod_dut:");
				dump(dest_ptrs[i], 25);
				return -1;
			}
		}

		putchar('.');

	}

	printf("Pass\n");
	return 0;

}
|
164
erasure_code/gf_vect_mul_avx.asm
Normal file
164
erasure_code/gf_vect_mul_avx.asm
Normal file
@ -0,0 +1,164 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mul_avx(len, mul_array, src, dest)
;;;
;;; GF(2^8) constant multiply of a byte buffer, AVX version.
;;; mul_array is a 32B table: 16 products of the constant with each low
;;; nibble value, followed by 16 products with each high nibble value
;;; (see the two 16B loads into xgft_lo/xgft_hi below).
;;; NOTE(review): the loop consumes 32 bytes per pass with no tail
;;; handling, so len is assumed to be a multiple of 32 -- confirm callers
;;; guarantee this.
;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
; System V AMD64 calling convention: args in registers, no xmm save needed
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define return rax
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
; Windows x64 calling convention: xmm6-xmm15 are callee-saved, so the
; xmm registers clobbered below (xmm6, xmm7, xmm13-xmm15) must be spilled
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define return rax
 %define stack_size  5*16 + 8 	; must be an odd multiple of 8
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm13, 2*16
	save_xmm128	xmm14, 3*16
	save_xmm128	xmm15, 4*16
	end_prolog
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm13, [rsp + 2*16]
	vmovdqa	xmm14, [rsp + 3*16]
	vmovdqa	xmm15, [rsp + 4*16]
	add	rsp, stack_size
 %endmacro

%endif


%define len   arg0
%define mul_array arg1
%define src   arg2
%define dest  arg3
%define pos   return		; loop cursor doubles as the return value


;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
 %define XLDR vmovdqa
 %define XSTR vmovdqa
%else
 %define XLDR vmovntdqa
 %define XSTR vmovntdq
%endif

default rel

[bits 64]
section .text

; xmm13-15 hold loop-invariant constants; xmm0-7 are per-iteration scratch
%define xmask0f  xmm15
%define xgft_lo  xmm14
%define xgft_hi  xmm13

%define x0     xmm0
%define xtmp1a xmm1
%define xtmp1b xmm2
%define xtmp1c xmm3
%define x1     xmm4
%define xtmp2a xmm5
%define xtmp2b xmm6
%define xtmp2c xmm7

align 16
global gf_vect_mul_avx:function
func(gf_vect_mul_avx)
	FUNC_SAVE
	mov	pos, 0
	vmovdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	vmovdqu	xgft_lo, [mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	vmovdqu	xgft_hi, [mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

loop32:
	XLDR	x0, [src+pos]		;Get next source vector
	XLDR	x1, [src+pos+16]	;Get next source vector + 16B ahead
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	vpand	xtmp1a, x0, xmask0f	;Mask low src nibble in bits 4-0
	vpand	xtmp2a, x1, xmask0f
	vpsraw	x0, x0, 4		;Shift to put high nibble into bits 4-0
	vpsraw	x1, x1, 4
	vpand	x0, x0, xmask0f		;Mask high src nibble in bits 4-0
	vpand	x1, x1, xmask0f
	vpshufb	xtmp1b, xgft_hi, x0	;Lookup mul table of high nibble
	vpshufb	xtmp1c, xgft_lo, xtmp1a	;Lookup mul table of low nibble
	vpshufb	xtmp2b, xgft_hi, x1	;Lookup mul table of high nibble
	vpshufb	xtmp2c, xgft_lo, xtmp2a	;Lookup mul table of low nibble
	vpxor	xtmp1b, xtmp1b, xtmp1c	;GF add high and low partials
	vpxor	xtmp2b, xtmp2b, xtmp2c
	XSTR	[dest+pos-32], xtmp1b	;Store result
	XSTR	[dest+pos-16], xtmp2b	;Store +16B result
	jl	loop32


return_pass:
	FUNC_RESTORE
	sub	pos, len		;pos == len here, so rax (return) becomes 0
	ret

return_fail:
	; NOTE(review): unreachable -- no branch targets this label; kept for
	; symmetry with sibling kernels that do validate their arguments
	FUNC_RESTORE
	mov	return, 1
	ret

endproc_frame

section .data

align 16

mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_vect_mul_avx, 01, 03, 0036
|
99
erasure_code/gf_vect_mul_avx_perf.c
Normal file
99
erasure_code/gf_vect_mul_avx_perf.c
Normal file
@ -0,0 +1,99 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / 2
|
||||
# define TEST_LOOPS 1000
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define TEST_MEM (2 * TEST_LEN)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
|
||||
struct perf start, stop;
|
||||
|
||||
printf("gf_vect_mul_avx_perf:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
// Allocate large mem region
|
||||
buff1 = (u8 *) malloc(TEST_LEN);
|
||||
buff2 = (u8 *) malloc(TEST_LEN);
|
||||
if (NULL == buff1 || NULL == buff2) {
|
||||
printf("Failed to allocate %dB\n", TEST_LEN);
|
||||
return 1;
|
||||
}
|
||||
|
||||
memset(buff1, 0, TEST_LEN);
|
||||
memset(buff2, 0, TEST_LEN);
|
||||
|
||||
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
|
||||
printf("Start timed tests\n");
|
||||
fflush(0);
|
||||
|
||||
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_avx(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_mul_avx" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * i);
|
||||
|
||||
return 0;
|
||||
}
|
143
erasure_code/gf_vect_mul_avx_test.c
Normal file
143
erasure_code/gf_vect_mul_avx_test.c
Normal file
@ -0,0 +1,143 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE 8192
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 100000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
unsigned char *efence_buff3;
|
||||
|
||||
printf("gf_vect_mul_avx:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul_avx(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
|
||||
gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
printf("Random tests ");
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_avx(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
efence_buff3 = buff3 + size;
|
||||
|
||||
gf_vect_mul_avx(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (efence_buff2[i] != efence_buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, efence_buff2[i], efence_buff3[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
return 0;
|
||||
}
|
129
erasure_code/gf_vect_mul_base_test.c
Normal file
129
erasure_code/gf_vect_mul_base_test.c
Normal file
@ -0,0 +1,129 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE 8192
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 100000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
|
||||
printf("gf_vect_mul_base_test:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
|
||||
gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
printf("Random tests ");
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
return 0;
|
||||
}
|
99
erasure_code/gf_vect_mul_perf.c
Normal file
99
erasure_code/gf_vect_mul_perf.c
Normal file
@ -0,0 +1,99 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / 2
|
||||
# define TEST_LOOPS 1000
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define TEST_MEM (2 * TEST_LEN)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
|
||||
struct perf start, stop;
|
||||
|
||||
printf("gf_vect_mul_perf:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
// Allocate large mem region
|
||||
buff1 = (u8 *) malloc(TEST_LEN);
|
||||
buff2 = (u8 *) malloc(TEST_LEN);
|
||||
if (NULL == buff1 || NULL == buff2) {
|
||||
printf("Failed to allocate %dB\n", TEST_LEN);
|
||||
return 1;
|
||||
}
|
||||
|
||||
memset(buff1, 0, TEST_LEN);
|
||||
memset(buff2, 0, TEST_LEN);
|
||||
|
||||
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
|
||||
printf("Start timed tests\n");
|
||||
fflush(0);
|
||||
|
||||
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_mul" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * i);
|
||||
|
||||
return 0;
|
||||
}
|
170
erasure_code/gf_vect_mul_sse.asm
Normal file
170
erasure_code/gf_vect_mul_sse.asm
Normal file
@ -0,0 +1,170 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_mul_sse(len, mul_array, src, dest)
;;;
;;; GF(2^8) constant multiply of a byte buffer, SSE version.
;;; mul_array is a 32B table: 16 products of the constant with each low
;;; nibble value, followed by 16 products with each high nibble value
;;; (see the two 16B loads into xgft_lo/xgft_hi below).
;;; NOTE(review): the loop consumes 32 bytes per pass with no tail
;;; handling, so len is assumed to be a multiple of 32 -- confirm callers
;;; guarantee this.
;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
; System V AMD64 calling convention: args in registers, no xmm save needed
 %define arg0  rdi
 %define arg1  rsi
 %define arg2  rdx
 %define arg3  rcx
 %define arg4  r8
 %define arg5  r9
 %define tmp   r11
 %define return rax
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE

%elifidn __OUTPUT_FORMAT__, win64
; Windows x64 calling convention: xmm6-xmm15 are callee-saved, so the
; xmm registers clobbered below (xmm6, xmm7, xmm13-xmm15) must be spilled
 %define arg0  rcx
 %define arg1  rdx
 %define arg2  r8
 %define arg3  r9
 %define return rax
 %define stack_size  5*16 + 8 	; must be an odd multiple of 8
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6, 0*16
	save_xmm128	xmm7, 1*16
	save_xmm128	xmm13, 2*16
	save_xmm128	xmm14, 3*16
	save_xmm128	xmm15, 4*16
	end_prolog
 %endmacro

 %macro FUNC_RESTORE 0
	movdqa	xmm6, [rsp + 0*16]
	movdqa	xmm7, [rsp + 1*16]
	movdqa	xmm13, [rsp + 2*16]
	movdqa	xmm14, [rsp + 3*16]
	movdqa	xmm15, [rsp + 4*16]
	add	rsp, stack_size
 %endmacro

%endif


%define len   arg0
%define mul_array arg1
%define src   arg2
%define dest  arg3
%define pos   return		; loop cursor doubles as the return value


;;; Use Non-temporal load/store
%ifdef NO_NT_LDST
 %define XLDR movdqa
 %define XSTR movdqa
%else
 %define XLDR movntdqa
 %define XSTR movntdq
%endif

default rel

[bits 64]
section .text

; xmm13-15 hold loop-invariant constants; xmm0-7 are per-iteration scratch
%define xmask0f  xmm15
%define xgft_lo  xmm14
%define xgft_hi  xmm13

%define x0     xmm0
%define xtmp1a xmm1
%define xtmp1b xmm2
%define xtmp1c xmm3
%define x1     xmm4
%define xtmp2a xmm5
%define xtmp2b xmm6
%define xtmp2c xmm7


align 16
global gf_vect_mul_sse:function
func(gf_vect_mul_sse)
	FUNC_SAVE
	mov	pos, 0
	movdqa	xmask0f, [mask0f]	;Load mask of lower nibble in each byte
	movdqu	xgft_lo, [mul_array]	;Load array Cx{00}, Cx{01}, Cx{02}, ...
	movdqu	xgft_hi, [mul_array+16]	; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}

loop32:
	XLDR	x0, [src+pos]		;Get next source vector
	XLDR	x1, [src+pos+16]	;Get next source vector + 16B ahead
	; Two-operand SSE pshufb overwrites its destination, so the const
	; tables must be recopied each pass (AVX variant avoids this)
	movdqa	xtmp1b, xgft_hi		;Reload const array registers
	movdqa	xtmp1c, xgft_lo
	movdqa	xtmp2b, xgft_hi
	movdqa	xtmp2c, xgft_lo
	movdqa	xtmp1a, x0		;Keep unshifted copy of src
	movdqa	xtmp2a, x1
	psraw	x0, 4			;Shift to put high nibble into bits 4-0
	psraw	x1, 4
	pand	xtmp1a, xmask0f		;Mask low src nibble in bits 4-0
	pand	xtmp2a, xmask0f
	pand	x0, xmask0f		;Mask high src nibble in bits 4-0
	pand	x1, xmask0f
	pshufb	xtmp1b, x0		;Lookup mul table of high nibble
	pshufb	xtmp1c, xtmp1a		;Lookup mul table of low nibble
	pshufb	xtmp2b, x1
	pshufb	xtmp2c, xtmp2a
	pxor	xtmp1b, xtmp1c		;GF add high and low partials
	pxor	xtmp2b, xtmp2c
	XSTR	[dest+pos], xtmp1b	;Store result
	XSTR	[dest+pos+16], xtmp2b	;Store +16B result
	add	pos, 32			;Loop on 32 bytes at a time
	cmp	pos, len
	jl	loop32


return_pass:
	sub	pos, len		;pos == len here, so rax (return) becomes 0
	FUNC_RESTORE
	ret

return_fail:
	; NOTE(review): unreachable -- no branch targets this label; kept for
	; symmetry with sibling kernels that do validate their arguments
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

section .data

align 16
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

;;;       func        core, ver, snum
slversion gf_vect_mul_sse, 00, 03, 0034
|
97
erasure_code/gf_vect_mul_sse_perf.c
Normal file
97
erasure_code/gf_vect_mul_sse_perf.c
Normal file
@ -0,0 +1,97 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
#include "test.h"
|
||||
|
||||
//#define CACHED_TEST
|
||||
#ifdef CACHED_TEST
|
||||
// Cached test, loop many times over small dataset
|
||||
# define TEST_LEN 8*1024
|
||||
# define TEST_LOOPS 4000000
|
||||
# define TEST_TYPE_STR "_warm"
|
||||
#else
|
||||
# ifndef TEST_CUSTOM
|
||||
// Uncached test. Pull from large mem base.
|
||||
# define TEST_SOURCES 10
|
||||
# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
|
||||
# define TEST_LEN GT_L3_CACHE / 2
|
||||
# define TEST_LOOPS 1000
|
||||
# define TEST_TYPE_STR "_cold"
|
||||
# else
|
||||
# define TEST_TYPE_STR "_cus"
|
||||
# ifndef TEST_LOOPS
|
||||
# define TEST_LOOPS 1000
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define TEST_MEM (2 * TEST_LEN)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, gf_const_tbl[64], a = 2;
|
||||
struct perf start, stop;
|
||||
|
||||
printf("gf_vect_mul_sse_perf:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
// Allocate large mem region
|
||||
buff1 = (u8 *) malloc(TEST_LEN);
|
||||
buff2 = (u8 *) malloc(TEST_LEN);
|
||||
if (NULL == buff1 || NULL == buff2) {
|
||||
printf("Failed to allocate %dB\n", TEST_LEN);
|
||||
return 1;
|
||||
}
|
||||
|
||||
memset(buff1, 0, TEST_LEN);
|
||||
memset(buff2, 0, TEST_LEN);
|
||||
|
||||
printf("Start timed tests\n");
|
||||
fflush(0);
|
||||
|
||||
gf_vect_mul_sse(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
perf_start(&start);
|
||||
for (i = 0; i < TEST_LOOPS; i++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl); // in a re-build would only calc once
|
||||
gf_vect_mul_sse(TEST_LEN, gf_const_tbl, buff1, buff2);
|
||||
}
|
||||
perf_stop(&stop);
|
||||
printf("gf_vect_mul_sse" TEST_TYPE_STR ": ");
|
||||
perf_print(stop, start, (long long)TEST_LEN * i);
|
||||
|
||||
return 0;
|
||||
}
|
160
erasure_code/gf_vect_mul_sse_test.c
Normal file
160
erasure_code/gf_vect_mul_sse_test.c
Normal file
@ -0,0 +1,160 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE (128*1024)
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int tsize;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
unsigned char *efence_buff3;
|
||||
|
||||
printf("gf_vect_mul_sse_test: ");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul_sse(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++) {
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i,
|
||||
buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++) {
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_sse(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Check buffer len
|
||||
for (tsize = TEST_SIZE; tsize > 0; tsize -= 32) {
|
||||
a = rand();
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul_sse(tsize, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < tsize; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return -1;
|
||||
}
|
||||
if (0 == tsize % (32 * 8)) {
|
||||
putchar('.');
|
||||
fflush(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
efence_buff3 = buff3 + size;
|
||||
|
||||
gf_vect_mul_sse(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (efence_buff2[i] != efence_buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, efence_buff2[i], efence_buff3[i], gf_mul(2,
|
||||
efence_buff1
|
||||
[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
fflush(0);
|
||||
return 0;
|
||||
}
|
142
erasure_code/gf_vect_mul_test.c
Normal file
142
erasure_code/gf_vect_mul_test.c
Normal file
@ -0,0 +1,142 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h> // for memset
|
||||
#include "erasure_code.h"
|
||||
|
||||
#define TEST_SIZE 8192
|
||||
#define TEST_MEM TEST_SIZE
|
||||
#define TEST_LOOPS 100000
|
||||
#define TEST_TYPE_STR ""
|
||||
|
||||
typedef unsigned char u8;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
u8 *buff1, *buff2, *buff3, gf_const_tbl[64], a = 2;
|
||||
int align, size;
|
||||
unsigned char *efence_buff1;
|
||||
unsigned char *efence_buff2;
|
||||
unsigned char *efence_buff3;
|
||||
|
||||
printf("gf_vect_mul_test:\n");
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
|
||||
buff1 = (u8 *) malloc(TEST_SIZE);
|
||||
buff2 = (u8 *) malloc(TEST_SIZE);
|
||||
buff3 = (u8 *) malloc(TEST_SIZE);
|
||||
|
||||
if (NULL == buff1 || NULL == buff2 || NULL == buff3) {
|
||||
printf("buffer alloc error\n");
|
||||
return -1;
|
||||
}
|
||||
// Fill with rand data
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n", i, buff1[i], buff2[i],
|
||||
gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE, gf_const_tbl, buff1, buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
if (buff2[i] != buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(a, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++)
|
||||
buff1[i] = rand();
|
||||
|
||||
// Check each possible constant
|
||||
printf("Random tests ");
|
||||
for (a = 0; a != 255; a++) {
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
gf_vect_mul(TEST_SIZE, gf_const_tbl, buff1, buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE; i++) {
|
||||
if (gf_mul(a, buff1[i]) != buff2[i]) {
|
||||
printf("fail at %d, 0x%x x %d = 0x%x (0x%x)\n",
|
||||
i, a, buff1[i], buff2[i], gf_mul(2, buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
// Run tests at end of buffer for Electric Fence
|
||||
align = 32;
|
||||
a = 2;
|
||||
|
||||
gf_vect_mul_init(a, gf_const_tbl);
|
||||
for (size = 0; size < TEST_SIZE; size += align) {
|
||||
// Line up TEST_SIZE from end
|
||||
efence_buff1 = buff1 + size;
|
||||
efence_buff2 = buff2 + size;
|
||||
efence_buff3 = buff3 + size;
|
||||
|
||||
gf_vect_mul(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff2);
|
||||
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (gf_mul(a, efence_buff1[i]) != efence_buff2[i]) {
|
||||
printf("fail at %d, 0x%x x 2 = 0x%x (0x%x)\n",
|
||||
i, efence_buff1[i], efence_buff2[i],
|
||||
gf_mul(2, efence_buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
gf_vect_mul_base(TEST_SIZE - size, gf_const_tbl, efence_buff1, efence_buff3);
|
||||
|
||||
// Check reference function
|
||||
for (i = 0; i < TEST_SIZE - size; i++)
|
||||
if (efence_buff2[i] != efence_buff3[i]) {
|
||||
printf("fail at %d, 0x%x x 0x%d = 0x%x (0x%x)\n",
|
||||
i, a, efence_buff2[i], efence_buff3[i],
|
||||
gf_mul(2, efence_buff1[i]));
|
||||
return 1;
|
||||
}
|
||||
|
||||
putchar('.');
|
||||
}
|
||||
|
||||
printf(" done: Pass\n");
|
||||
return 0;
|
||||
}
|
933
include/erasure_code.h
Normal file
933
include/erasure_code.h
Normal file
@ -0,0 +1,933 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
#ifndef _ERASURE_CODE_H_
|
||||
#define _ERASURE_CODE_H_
|
||||
|
||||
/**
|
||||
* @file erasure_code.h
|
||||
* @brief Interface to functions supporting erasure code encode and decode.
|
||||
*
|
||||
* This file defines the interface to optimized functions used in erasure
|
||||
* codes. Encode and decode of erasures in GF(2^8) are made by calculating the
|
||||
* dot product of the symbols (bytes in GF(2^8)) across a set of buffers and a
|
||||
* set of coefficients. Values for the coefficients are determined by the type
|
||||
* of erasure code. Using a general dot product means that any sequence of
|
||||
* coefficients may be used including erasure codes based on random
|
||||
* coefficients.
|
||||
* Multiple versions of dot product are supplied to calculate 1-6 output
|
||||
* vectors in one pass.
|
||||
* Base GF multiply and divide functions can be sped up by defining
|
||||
* GF_LARGE_TABLES at the expense of memory size.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "gf_vect_mul.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Initialize tables for fast Erasure Code encode and decode.
|
||||
*
|
||||
* Generates the expanded tables needed for fast encode or decode for erasure
|
||||
* codes on blocks of data. 32bytes is generated for each input coefficient.
|
||||
*
|
||||
* @param k The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param rows The number of output vectors to concurrently encode/decode.
|
||||
* @param a Pointer to sets of arrays of input coefficients used to encode
|
||||
* or decode data.
|
||||
* @param gftbls Pointer to start of space for concatenated output tables
|
||||
* generated from input coefficients. Must be of size 32*k*rows.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
|
||||
*
|
||||
* Given a list of source data blocks, generate one or multiple blocks of
|
||||
* encoded data as specified by a matrix of GF(2^8) coefficients. When given a
|
||||
* suitable set of coefficients, this function will perform the fast generation
|
||||
* or decoding of Reed-Solomon type erasure codes.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and
|
||||
* selects the appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each block of data (vector) of source or dest data.
|
||||
* @param k The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param rows The number of output vectors to concurrently encode/decode.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*k*rows
|
||||
* @param data Array of pointers to source input buffers.
|
||||
* @param coding Array of pointers to coded output buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data.
|
||||
*
|
||||
* Arch specific version of ec_encode_data() with same parameters.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data.
|
||||
*
|
||||
* Arch specific version of ec_encode_data() with same parameters.
|
||||
* @requires AVX
|
||||
*/
|
||||
void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data.
|
||||
*
|
||||
* Arch specific version of ec_encode_data() with same parameters.
|
||||
* @requires AVX2
|
||||
*/
|
||||
void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
|
||||
unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate or decode erasure codes on blocks of data, runs baseline version.
|
||||
*
|
||||
* Baseline version of ec_encode_data() with same parameters.
|
||||
*/
|
||||
void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
|
||||
*
|
||||
* Given one source data block, update one or multiple blocks of encoded data as
|
||||
* specified by a matrix of GF(2^8) coefficients. When given a suitable set of
|
||||
* coefficients, this function will perform the fast generation or decoding of
|
||||
* Reed-Solomon type erasure codes from one input source at a time.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and selects the
|
||||
* appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each block of data (vector) of source or dest data.
|
||||
* @param k The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param rows The number of output vectors to concurrently encode/decode.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param g_tbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*k*rows
|
||||
* @param data Pointer to single input source used to update output parity.
|
||||
* @param coding Array of pointers to coded output buffers.
|
||||
* @returns none
|
||||
*/
|
||||
void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Arch specific version of ec_encode_data_update() with same parameters.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Arch specific version of ec_encode_data_update() with same parameters.
|
||||
* @requires AVX
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Arch specific version of ec_encode_data_update() with same parameters.
|
||||
* @requires AVX2
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
|
||||
unsigned char *data, unsigned char **coding);
|
||||
|
||||
/**
|
||||
* @brief Generate update for encode or decode of erasure codes from single source.
|
||||
*
|
||||
* Baseline version of ec_encode_data_update().
|
||||
*/
|
||||
|
||||
void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
|
||||
unsigned char *data, unsigned char **dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with two outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate two ouputs at a time. Does two
|
||||
* GF(2^8) dot products across each byte of the input array and two constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 2*32*vlen byte constant array based on the two sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with two outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate two ouputs at a time. Does two
|
||||
* GF(2^8) dot products across each byte of the input array and two constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 2*32*vlen byte constant array based on the two sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with two outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate two ouputs at a time. Does two
|
||||
* GF(2^8) dot products across each byte of the input array and two constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 2*32*vlen byte constant array based on the two sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with three outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate three ouputs at a time. Does three
|
||||
* GF(2^8) dot products across each byte of the input array and three constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 3*32*vlen byte constant array based on the three sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with three outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate three ouputs at a time. Does three
|
||||
* GF(2^8) dot products across each byte of the input array and three constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 3*32*vlen byte constant array based on the three sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with three outputs.
|
||||
*
|
||||
* Vector dot product optimized to calculate three ouputs at a time. Does three
|
||||
* GF(2^8) dot products across each byte of the input array and three constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 3*32*vlen byte constant array based on the three sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with four outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate four outputs at a time. Does four
|
||||
* GF(2^8) dot products across each byte of the input array and four constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 4*32*vlen byte constant array based on the four sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with four outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate four outputs at a time. Does four
|
||||
* GF(2^8) dot products across each byte of the input array and four constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 4*32*vlen byte constant array based on the four sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with four outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate four outputs at a time. Does four
|
||||
* GF(2^8) dot products across each byte of the input array and four constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 4*32*vlen byte constant array based on the four sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with five outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate five outputs at a time. Does five
|
||||
* GF(2^8) dot products across each byte of the input array and five constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 5*32*vlen byte constant array based on the five sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
 * @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with five outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate five outputs at a time. Does five
|
||||
* GF(2^8) dot products across each byte of the input array and five constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 5*32*vlen byte constant array based on the five sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
 * @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with five outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate five outputs at a time. Does five
|
||||
* GF(2^8) dot products across each byte of the input array and five constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 5*32*vlen byte constant array based on the five sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
 * @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with six outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate six outputs at a time. Does six
|
||||
* GF(2^8) dot products across each byte of the input array and six constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 6*32*vlen byte constant array based on the six sets of input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with six outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate six outputs at a time. Does six
|
||||
* GF(2^8) dot products across each byte of the input array and six constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 6*32*vlen byte constant array based on the six sets of input coefficients.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product with six outputs.
|
||||
*
|
||||
 * Vector dot product optimized to calculate six outputs at a time. Does six
|
||||
* GF(2^8) dot products across each byte of the input array and six constant
|
||||
* sets of coefficients to produce each byte of the outputs. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 6*32*vlen byte constant array based on the six sets of input coefficients.
|
||||
* @requires AVX2
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
|
||||
* based on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Array of pointers to destination data buffers.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_6vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product, runs baseline version.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 16.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients. Only elements 32*CONST*j + 1
|
||||
* of this array are used, where j = (0, 1, 2...) and CONST is the
|
||||
* number of elements in the array of input coefficients. The
|
||||
* elements used correspond to the original input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector dot product, runs appropriate version.
|
||||
*
|
||||
* Does a GF(2^8) dot product across each byte of the input array and a constant
|
||||
* set of coefficients to produce each byte of the output. Can be used for
|
||||
* erasure coding encode and decode. Function requires pre-calculation of a
|
||||
* 32*vlen byte constant array based on the input coefficients.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and
|
||||
* selects the appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vlen Number of vector sources.
|
||||
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
|
||||
* on the array of input coefficients.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
|
||||
unsigned char **src, unsigned char *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, runs appropriate version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constant and add to destination array. Can be used for erasure coding encode
|
||||
* and decode update when only one source is available at a time. Function
|
||||
* requires pre-calculation of a 32*vec byte constant array based on the input
|
||||
* coefficients.
|
||||
*
|
||||
* This function determines what instruction sets are enabled and selects the
|
||||
* appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Array of pointers to source inputs.
|
||||
* @param dest Pointer to destination data array.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, arch specific version.
|
||||
*
|
||||
* Arch specific version of gf_vect_mad() with same parameters.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
|
||||
void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, arch specific version.
|
||||
*
|
||||
* Arch specific version of gf_vect_mad() with same parameters.
|
||||
* @requires AVX
|
||||
*/
|
||||
|
||||
void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, arch specific version.
|
||||
*
|
||||
* Arch specific version of gf_vect_mad() with same parameters.
|
||||
* @requires AVX2
|
||||
*/
|
||||
|
||||
void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply accumulate, baseline version.
|
||||
*
|
||||
* Baseline version of gf_vect_mad() with same parameters.
|
||||
*/
|
||||
|
||||
void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 2 accumulate. SSE version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constants and add to destination arrays. Can be used for erasure coding
|
||||
* encode and decode update when only one source is available at a
|
||||
* time. Function requires pre-calculation of a 32*vec byte constant array based
|
||||
* on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Pointer to source input array.
|
||||
* @param dest Array of pointers to destination input/outputs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constants and add to destination arrays. Can be used for erasure coding
|
||||
* encode and decode update when only one source is available at a
|
||||
* time. Function requires pre-calculation of a 32*vec byte constant array based
|
||||
* on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Pointer to source input array.
|
||||
* @param dest Array of pointers to destination input/outputs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
|
||||
*
|
||||
* Does a GF(2^8) multiply across each byte of input source with expanded
|
||||
* constants and add to destination arrays. Can be used for erasure coding
|
||||
* encode and decode update when only one source is available at a
|
||||
* time. Function requires pre-calculation of a 32*vec byte constant array based
|
||||
* on the input coefficients.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of each vector in bytes. Must be >= 32.
|
||||
* @param vec The number of vector sources or rows in the generator matrix
|
||||
* for coding.
|
||||
* @param vec_i The vector index corresponding to the single input source.
|
||||
* @param gftbls Pointer to array of input tables generated from coding
|
||||
* coefficients in ec_init_tables(). Must be of size 32*vec.
|
||||
* @param src Pointer to source input array.
|
||||
* @param dest Array of pointers to destination input/outputs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
|
||||
* @requires SSE4.1
|
||||
*/
|
||||
void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
|
||||
* @requires AVX
|
||||
*/
|
||||
void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
|
||||
* @requires AVX2
|
||||
*/
|
||||
void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
|
||||
unsigned char **dest);
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* The remaining are lib support functions used in GF(2^8) operations.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Single element GF(2^8) multiply.
|
||||
*
|
||||
* @param a Multiplicand a
|
||||
* @param b Multiplicand b
|
||||
* @returns Product of a and b in GF(2^8)
|
||||
*/
|
||||
|
||||
unsigned char gf_mul(unsigned char a, unsigned char b);
|
||||
|
||||
/**
|
||||
* @brief Single element GF(2^8) inverse.
|
||||
*
|
||||
* @param a Input element
|
||||
* @returns Field element b such that a x b = {1}
|
||||
*/
|
||||
|
||||
unsigned char gf_inv(unsigned char a);
|
||||
|
||||
/**
|
||||
* @brief Generate a matrix of coefficients to be used for encoding.
|
||||
*
|
||||
* Vandermonde matrix example of encoding coefficients where high portion of
|
||||
* matrix is identity matrix I and lower portion is constructed as 2^{i*(j-k+1)}
|
||||
* i:{0,k-1} j:{k,m-1}. Commonly used method for choosing coefficients in
|
||||
 * erasure encoding but does not guarantee invertibility for every sub matrix. For
|
||||
* large k it is possible to find cases where the decode matrix chosen from
|
||||
 * sources and parity not in erasure are not invertible. Users may want to
|
||||
* adjust for k > 5.
|
||||
*
|
||||
* @param a [mxk] array to hold coefficients
|
||||
* @param m number of rows in matrix corresponding to srcs + parity.
|
||||
* @param k number of columns in matrix corresponding to srcs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_gen_rs_matrix(unsigned char *a, int m, int k);
|
||||
|
||||
/**
|
||||
* @brief Generate a Cauchy matrix of coefficients to be used for encoding.
|
||||
*
|
||||
* Cauchy matrix example of encoding coefficients where high portion of matrix
|
||||
* is identity matrix I and lower portion is constructed as 1/(i + j) | i != j,
|
||||
 * i:{0,k-1} j:{k,m-1}. Any sub-matrix of a Cauchy matrix should be invertible.
|
||||
*
|
||||
* @param a [mxk] array to hold coefficients
|
||||
* @param m number of rows in matrix corresponding to srcs + parity.
|
||||
* @param k number of columns in matrix corresponding to srcs.
|
||||
* @returns none
|
||||
*/
|
||||
|
||||
void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
|
||||
|
||||
/**
|
||||
* @brief Invert a matrix in GF(2^8)
|
||||
*
|
||||
* @param in input matrix
|
||||
* @param out output matrix such that [in] x [out] = [I] - identity matrix
|
||||
* @param n size of matrix [nxn]
|
||||
* @returns 0 successful, other fail on singular input matrix
|
||||
*/
|
||||
|
||||
int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
|
||||
|
||||
|
||||
/*************************************************************/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ERASURE_CODE_H_
|
148
include/gf_vect_mul.h
Normal file
148
include/gf_vect_mul.h
Normal file
@ -0,0 +1,148 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
#ifndef _GF_VECT_MUL_H
|
||||
#define _GF_VECT_MUL_H
|
||||
|
||||
/**
|
||||
* @file gf_vect_mul.h
|
||||
* @brief Interface to functions for vector (block) multiplication in GF(2^8).
|
||||
*
|
||||
* This file defines the interface to routines used in fast RAID rebuild and
|
||||
* erasure codes.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
|
||||
* and src must be aligned to 32B.
|
||||
* @requires SSE4.1
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
* @returns 0 pass, other fail
|
||||
*/
|
||||
|
||||
int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
|
||||
* and src must be aligned to 32B.
|
||||
* @requires AVX
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
* @returns 0 pass, other fail
|
||||
*/
|
||||
|
||||
int gf_vect_mul_avx(int len, unsigned char *gftbl, void *src, void *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant, runs appropriate version.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }.
|
||||
* Len and src must be aligned to 32B.
|
||||
*
|
||||
* This function determines what instruction sets are enabled
|
||||
* and selects the appropriate version at runtime.
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
* @returns 0 pass, other fail
|
||||
*/
|
||||
|
||||
int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Initialize 32-byte constant array for GF(2^8) vector multiply
|
||||
*
|
||||
* Calculates array {C{00}, C{01}, C{02}, ... , C{0f} }, {C{00}, C{10},
|
||||
* C{20}, ... , C{f0} } as required by other fast vector multiply
|
||||
* functions.
|
||||
* @param c Constant input.
|
||||
* @param gftbl Table output.
|
||||
*/
|
||||
|
||||
void gf_vect_mul_init(unsigned char c, unsigned char* gftbl);
|
||||
|
||||
|
||||
/**
|
||||
* @brief GF(2^8) vector multiply by constant, runs baseline version.
|
||||
*
|
||||
* Does a GF(2^8) vector multiply b = Ca where a and b are arrays and C
|
||||
* is a single field element in GF(2^8). Can be used for RAID6 rebuild
|
||||
* and partial write functions. Function requires pre-calculation of a
|
||||
* 32-element constant array based on constant C. gftbl(C) = {C{00},
|
||||
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
|
||||
* and src must be aligned to 32B.
|
||||
*
|
||||
* @param len Length of vector in bytes. Must be aligned to 32B.
|
||||
* @param a Pointer to 32-byte array of pre-calculated constants based on C.
|
||||
 * Only the 2nd element of this array is used.
|
||||
* @param src Pointer to src data array. Must be aligned to 32B.
|
||||
* @param dest Pointer to destination data array. Must be aligned to 32B.
|
||||
*/
|
||||
|
||||
void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src,
|
||||
unsigned char *dest);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_GF_VECT_MUL_H
|
123
include/reg_sizes.asm
Normal file
123
include/reg_sizes.asm
Normal file
@ -0,0 +1,123 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
;
|
||||
; Redistribution and use in source and binary forms, with or without
|
||||
; modification, are permitted provided that the following conditions
|
||||
; are met:
|
||||
; * Redistributions of source code must retain the above copyright
|
||||
; notice, this list of conditions and the following disclaimer.
|
||||
; * Redistributions in binary form must reproduce the above copyright
|
||||
; notice, this list of conditions and the following disclaimer in
|
||||
; the documentation and/or other materials provided with the
|
||||
; distribution.
|
||||
; * Neither the name of Intel Corporation nor the names of its
|
||||
; contributors may be used to endorse or promote products derived
|
||||
; from this software without specific prior written permission.
|
||||
;
|
||||
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; Include guard for this NASM/yasm helper file.
%ifndef _REG_SIZES_ASM_
%define _REG_SIZES_ASM_

; CPUID / XGETBV feature-flag masks for run-time CPU feature detection.
%define EFLAGS_HAS_CPUID (1<<21)
%define FLAG_CPUID1_ECX_CLMUL (1<<1)
%define FLAG_CPUID1_EDX_SSE2 (1<<26)
%define FLAG_CPUID1_ECX_SSE3 (1)
%define FLAG_CPUID1_ECX_SSE4_1 (1<<19)
%define FLAG_CPUID1_ECX_SSE4_2 (1<<20)
%define FLAG_CPUID1_ECX_POPCNT (1<<23)
%define FLAG_CPUID1_ECX_AESNI (1<<25)
%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
%define FLAG_CPUID1_ECX_AVX (1<<28)
; NOTE(review): AVX2 is reported by CPUID leaf 7 (EBX bit 5); the CPUID1
; prefix here suggests leaf 1 -- confirm the leaf used at the call sites.
%define FLAG_CPUID1_EBX_AVX2 (1<<5)
; XCR0 bits 1 and 2 set: OS saves both XMM and YMM state on context switch.
%define FLAG_XGETBV_EAX_XMM_YMM 0x6

; CPUID leaf 1 EAX signature (family/model/stepping) used to detect Avoton.
%define FLAG_CPUID1_EAX_AVOTON 0x000406d0

; Define d (dword), w (word) and b (byte) aliases for the 64-bit registers,
; consumed by the DWORD()/WORD()/BYTE() token-pasting macros below.

%define raxd eax
%define raxw ax
%define raxb al

%define rbxd ebx
%define rbxw bx
%define rbxb bl

%define rcxd ecx
%define rcxw cx
%define rcxb cl

%define rdxd edx
%define rdxw dx
%define rdxb dl

%define rsid esi
%define rsiw si
%define rsib sil

%define rdid edi
%define rdiw di
%define rdib dil

%define rbpd ebp
%define rbpw bp
%define rbpb bpl

; Map each ymm register name to its low-128-bit xmm alias for XWORD().
%define ymm0x xmm0
%define ymm1x xmm1
%define ymm2x xmm2
%define ymm3x xmm3
%define ymm4x xmm4
%define ymm5x xmm5
%define ymm6x xmm6
%define ymm7x xmm7
%define ymm8x xmm8
%define ymm9x xmm9
%define ymm10x xmm10
%define ymm11x xmm11
%define ymm12x xmm12
%define ymm13x xmm13
%define ymm14x xmm14
%define ymm15x xmm15

; Token-paste helpers: DWORD(rax) -> raxd -> eax, etc.
%define DWORD(reg) reg %+ d
%define WORD(reg) reg %+ w
%define BYTE(reg) reg %+ b

; XWORD(ymm3) -> ymm3x -> xmm3 (low 128 bits of a ymm register).
%define XWORD(reg) reg %+ x

; Mark the stack non-executable on ELF targets (GNU-stack note section).
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
section .text
%endif
; NOTE(review): remaps the token elf64 to macho64 so elf64-conditional code
; also matches when assembling for Mach-O -- confirm this is the intent.
%ifidn __OUTPUT_FORMAT__, macho64
%define elf64 macho64
%endif

; slversion name, major, minor, fixup:
; emits global symbols <name>_slver / <name>_slver_<maj><min><fix> holding a
; packed version stamp (dw fixup, db minor, db major) in .text.
%macro slversion 4
section .text
global %1_slver_%2%3%4
global %1_slver
%1_slver:
%1_slver_%2%3%4:
	dw 0x%4
	db 0x%3, 0x%2
%endmacro

%endif ; ifndef _REG_SIZES_ASM_
|
81
include/test.h
Normal file
81
include/test.h
Normal file
@ -0,0 +1,81 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
#ifndef _TEST_H
|
||||
#define _TEST_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Use sys/time.h functions for time
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
/* Simple timer handle for the perf_start/perf_stop/perf_print helpers:
 * just a captured gettimeofday() timestamp. */
struct perf{
	struct timeval tv;	/* wall-clock time recorded at start or stop */
};
inline int perf_start(struct perf *p)
|
||||
{
|
||||
return gettimeofday(&(p->tv), 0);
|
||||
}
|
||||
inline int perf_stop(struct perf *p)
|
||||
{
|
||||
return gettimeofday(&(p->tv), 0);
|
||||
}
|
||||
|
||||
/* Print elapsed time between start and stop in microseconds and, when a
 * non-zero data size is given, the implied bandwidth in MB/s.
 *
 * NOTE(review): uses printf but only <sys/time.h> is visibly included in
 * this header -- confirm <stdio.h> is pulled in by users of this file.
 * NOTE(review): plain `inline` (not `static inline`) in a header can cause
 * missing-definition link errors under strict C99 inline rules -- verify
 * the compilers/modes this is built with. */
inline void perf_print(struct perf stop, struct perf start, long long dsize)
{
	/* Elapsed time folded down to microseconds. */
	long long secs = stop.tv.tv_sec - start.tv.tv_sec;
	long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec;

	printf("runtime = %10lld usecs", usecs);
	if (dsize != 0) {
#if 1 // single-printf form; the split form below was a 32-bit printf workaround
		printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024),
			((double) usecs)/1000000, ((double) dsize) / (double)usecs);
#else
		printf(", bandwidth %lld MB ", dsize/(1024*1024));
		printf("in %.4f sec ",(double)usecs/1000000);
		printf("= %.2f MB/s\n", (double)dsize/usecs);
#endif
	}
	else
		printf("\n");
}
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _TEST_H
|
88
include/types.h
Normal file
88
include/types.h
Normal file
@ -0,0 +1,88 @@
|
||||
/**********************************************************************
|
||||
Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* @file types.h
|
||||
* @brief Defines standard width types.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __TYPES_H
|
||||
#define __TYPES_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Fixed-width unsigned/signed integer aliases.
 * NOTE(review): in the non-Windows branch UINT64/INT64 are `long int`,
 * which is 32 bits on 32-bit non-Windows targets -- confirm 32-bit builds
 * do not rely on these being 64 bits wide. */
#ifdef __WIN32__
#ifdef __MINGW32__
# include <_mingw.h>
#endif
typedef unsigned __int64 UINT64;
typedef __int64 INT64;
typedef unsigned __int32 UINT32;
typedef unsigned __int16 UINT16;
typedef unsigned char UINT8;
#else
typedef unsigned long int UINT64;
typedef long int INT64;
typedef unsigned int UINT32;
typedef unsigned short int UINT16;
typedef unsigned char UINT8;
#endif


/* Portability shims: aligned-declaration attribute, forced-inline keyword,
 * and aligned allocation/free. On Windows (non-MinGW), posix_memalign is
 * emulated on top of _aligned_malloc. */
#if defined __unix__ || defined __APPLE__
# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
# define __forceinline static inline
# define aligned_free(x) free(x)
#else
# ifdef __MINGW32__
# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
# define aligned_free(x) _aligned_free(x)
# else
# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
# define aligned_free(x) _aligned_free(x)
# endif
#endif

/* Debug logging: DEBUG_PRINT((fmt, ...)) -- double parens; compiles to a
 * no-op statement unless DEBUG is defined. */
#ifdef DEBUG
# define DEBUG_PRINT(x) printf x
#else
# define DEBUG_PRINT(x) do {} while (0)
#endif
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //__TYPES_H
|
56
isa-l.def
Normal file
56
isa-l.def
Normal file
@ -0,0 +1,56 @@
|
||||
LIBRARY isa-l
|
||||
EXPORTS
|
||||
|
||||
ec_encode_data_sse @1
|
||||
ec_init_tables @2
|
||||
gf_gen_cauchy1_matrix @3
|
||||
gf_gen_rs_matrix @4
|
||||
gf_invert_matrix @5
|
||||
gf_mul @6
|
||||
gf_vect_dot_prod_base @7
|
||||
gf_vect_mul_base @8
|
||||
ec_encode_data_base @9
|
||||
gf_vect_mul_init @10
|
||||
gf_vect_mul_sse @11
|
||||
gf_vect_mul_avx @12
|
||||
gf_vect_dot_prod_sse @13
|
||||
gf_vect_dot_prod_avx @14
|
||||
gf_vect_dot_prod_avx2 @15
|
||||
gf_2vect_dot_prod_sse @16
|
||||
gf_3vect_dot_prod_sse @17
|
||||
gf_4vect_dot_prod_sse @18
|
||||
gf_5vect_dot_prod_sse @19
|
||||
gf_6vect_dot_prod_sse @20
|
||||
gf_2vect_dot_prod_avx @21
|
||||
gf_3vect_dot_prod_avx @22
|
||||
gf_4vect_dot_prod_avx @23
|
||||
gf_5vect_dot_prod_avx @24
|
||||
gf_6vect_dot_prod_avx @25
|
||||
gf_2vect_dot_prod_avx2 @26
|
||||
gf_3vect_dot_prod_avx2 @27
|
||||
gf_4vect_dot_prod_avx2 @28
|
||||
gf_5vect_dot_prod_avx2 @29
|
||||
gf_6vect_dot_prod_avx2 @30
|
||||
gf_vect_mad_sse @31
|
||||
gf_2vect_mad_sse @32
|
||||
gf_3vect_mad_sse @33
|
||||
gf_4vect_mad_sse @34
|
||||
gf_5vect_mad_sse @35
|
||||
gf_6vect_mad_sse @36
|
||||
gf_vect_mad_avx @37
|
||||
gf_2vect_mad_avx @38
|
||||
gf_3vect_mad_avx @39
|
||||
gf_4vect_mad_avx @40
|
||||
gf_5vect_mad_avx @41
|
||||
gf_6vect_mad_avx @42
|
||||
gf_vect_mad_avx2 @43
|
||||
gf_2vect_mad_avx2 @44
|
||||
gf_3vect_mad_avx2 @45
|
||||
gf_4vect_mad_avx2 @46
|
||||
gf_5vect_mad_avx2 @47
|
||||
gf_6vect_mad_avx2 @48
|
||||
ec_encode_data @49
|
||||
gf_vect_mul @50
|
||||
ec_encode_data_update @51
|
||||
gf_vect_dot_prod @52
|
||||
gf_vect_mad @53
|
11
libisal.pc.in
Normal file
11
libisal.pc.in
Normal file
@ -0,0 +1,11 @@
|
||||
prefix=@prefix@
|
||||
exec_prefix=@exec_prefix@
|
||||
libdir=@libdir@
|
||||
includedir=@includedir@
|
||||
|
||||
Name: libisal
|
||||
Description: Library for storage systems
|
||||
Version: @VERSION@
|
||||
Libs: -L${libdir} -lisal
|
||||
Libs.private:
|
||||
Cflags: -I${includedir}
|
246
make.inc
Normal file
246
make.inc
Normal file
@ -0,0 +1,246 @@
|
||||
########################################################################
|
||||
# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
########################################################################
|
||||
|
||||
|
||||
# Makefile include for optimized libraries
|
||||
# make targets:
|
||||
# lib - build library of optimized functions
|
||||
# slib - build shared library
|
||||
# test - run unit tests of functions
|
||||
# perf - run performance tests
|
||||
# install - install headers and libs to system location
|
||||
# sim - run on simulator
|
||||
# trace - get simulator trace
|
||||
# clean - remove object files
|
||||
|
||||
version ?= #auto filled on release

# Toolchain defaults; overridable from the command line (make CC=... AS=...).
CC = gcc
AS = yasm
SIM = sde $(SIMFLAGS) --

# Per-assembler debug flags, selected later via $(DEBUG_$(AS)).
DEBUG = -g
DEBUG_yasm = -g dwarf2
DEBUG_nasm = -g

# Default arch= build options
# Flags are looked up by suffix: $(CFLAGS_$(arch)), $(ASFLAGS_$(CC)), etc.
CFLAGS_gcc = -Wall
ASFLAGS_ = -f elf64
ARFLAGS_ = cr $@
STRIP_gcc = strip -d -R .comment $@

# arch=32 build options
ASFLAGS_32 = -f elf32
CFLAGS_32 = -m32
ARFLAGS_32 = cr $@

# arch=win64 build options
ASFLAGS_win64 = -f win64
CFLAGS_icl = -Qstd=c99
ARFLAGS_win64 = -out:$@

# arch=mingw build options
ASFLAGS_mingw = -f win64
ARFLAGS_mingw = cr $@
lsrcmingw = $(lsrc)
unit_testsmingw = $(unit_tests)
examplesmingw = $(examples)
perf_testsmingw = $(perf_tests)

ifeq ($(arch),mingw)
CC=x86_64-w64-mingw32-gcc
AR=x86_64-w64-mingw32-ar
LDFLAGS = -Wl,--force-exe-suffix
endif


# Recursive (=) assignment is deliberate below: DEBUG and DEFINES are changed
# later by target-specific rules (e.g. "$(lib_name): DEBUG="), so CFLAGS and
# ASFLAGS must be re-expanded at use time rather than frozen with :=.
INCLUDE = $(patsubst %,-I%,$(subst :, ,$(VPATH)))
CFLAGS = $(CFLAGS_$(arch)) $(CFLAGS_$(CC)) $(DEBUG) -O2 $(DEFINES) $(INCLUDE)
ASFLAGS = $(ASFLAGS_$(arch)) $(ASFLAGS_$(CC)) $(DEBUG_$(AS)) $(DEFINES) $(INCLUDE)
ARFLAGS = $(ARFLAGS_$(arch))
DEFINES += $(addprefix -D , $D)
|
||||
# Output directory and object lists derived from the including Makefile's
# $(lsrc...) source lists.
O = bin
lobj += $(patsubst %.c,%.o,$(patsubst %.asm,%.o,$(lsrc$(arch)) $(lsrc_intrinsic)))
objs = $(addprefix $(O)/,$(notdir $(lobj)))


lib_name ?= isa-l.a
default: lib slib

# Defaults for windows build
ifeq ($(arch),win64)
AR=lib
CC=cl
OUTPUT_OPTION = -Fo$@
DEBUG=
lib_name := $(basename $(lib_name)).lib
endif
lsrcwin64 = $(lsrc)
unit_testswin64 = $(unit_tests)
exampleswin64 = $(examples)
perf_testswin64 = $(perf_tests)

# Build and run unit tests, performance tests, etc.
all_tests = $(notdir $(sort $(perf_tests$(arch)) $(check_tests$(arch)) $(unit_tests$(arch)) $(examples$(arch)) $(other_tests)))
all_unit_tests = $(notdir $(sort $(check_tests$(arch)) $(unit_tests$(arch))))

# Static pattern rules: each test binary depends on its .c and the lib.
$(all_unit_tests): % : %.c $(lib_name)
$(sort $(notdir $(perf_tests$(arch)))): % : %.c $(lib_name)
$(sort $(examples$(arch))): % : %.c $(lib_name)
$(sort $(other_tests)): % : %.c $(lib_name)

sim test trace: $(addsuffix .run,$(all_unit_tests))

perf: $(addsuffix .run,$(notdir $(perf_tests$(arch))))
ex: $(examples$(arch))
all: lib $(all_tests)
other: $(other_tests)
tests: $(all_unit_tests)
perfs: $(notdir $(perf_tests$(arch)))
# check/test/perf run natively (empty SIM); trace runs under the simulator.
check test perf: SIM=
trace: SIMFLAGS = -debugtrace
check test sim:
	@echo Finished running $@

#$(foreach c, $^, ./$c )
#for i in $^; do ./$$i ; done
||||
# Order-only prerequisite: objects need $(O) to exist, but its timestamp
# must not trigger rebuilds.
$(objs): | $(O)
$(O): ; mkdir -p $(O)


# Build rule to run tests
%.run: %
	$(SIM) $(@D)/$<
	@echo Completed run: $<

# Other build rules
msg = $(if $(DEBUG),DEBUG) $(patsubst 32,32-bit,$(arch)) $D

$(O)/%.o: %.asm
	@echo " ---> Building $< $(msg)"
	@$(AS) $(ASFLAGS) -o $@ $<

$(O)/%.o %.o: %.c
	@echo " ---> Building $< $(msg)"
	@$(COMPILE.c) $(OUTPUT_OPTION) $<

$(all_tests):
	@echo " ---> Building Test $@ $(msg)"
	@$(LINK.o) $(CFLAGS) $^ $(LDLIBS) -o $@


# Target to build lib files
lib: $(lib_name)
# Release lib build: strip debug info and define NDEBUG via target-specific
# variable overrides (this is why DEBUG/DEFINES use recursive assignment).
ifneq ($(lib_debug),1)
$(lib_name): DEBUG_$(AS)= # Don't put debug symbols in the lib
$(lib_name): DEBUG=
$(lib_name): DEFINES+=-D NDEBUG
endif
ifeq ($(lib_debug),1)
DEBUG+=-D DEBUG # Define DEBUG for macros
endif

#lib $(lib_name): $(lib_name)(${objs})
$(lib_name): $(objs)
	@echo " ---> Creating Lib $@"
	@$(AR) $(ARFLAGS) $^
	@$(STRIP_$(CC))
||||
# Target for shared lib
so_lib_name = bin/libisal.so
so_lib_inst = $(notdir $(so_lib_name))
so_lib_ver = $(so_lib_inst).$(version)
# soname carries only the major version (first dot-separated field).
soname = $(so_lib_inst).$(word 1, $(subst ., ,$(version)))

slib: $(so_lib_name)
# asm objects are shared with the static lib; C objects get a separate
# shared_ver_ prefix so they can be rebuilt with -fPIC.
aobjs += $(addprefix $(O)/,$(patsubst %.asm,%.o,$(filter %.asm,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))
shared_objs += $(addprefix $(O)/shared_ver_,$(patsubst %.c,%.o,$(filter %.c,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))

$(O)/shared_ver_%.o: %.c
	@echo " ---> Building shared $< $(msg)"
	@$(COMPILE.c) $(OUTPUT_OPTION) $<

# Release shared-lib build: same debug-stripping overrides as the static lib.
ifneq ($(lib_debug),1)
$(so_lib_name): DEBUG_$(AS)=
$(so_lib_name): DEBUG=
$(so_lib_name): DEFINES+=-D NDEBUG
endif

$(shared_objs): CFLAGS += -fPIC
$(shared_objs) $(aobjs): | $(O)
$(so_lib_name): LDFLAGS+=-Wl,-soname,$(soname)
$(so_lib_name): $(shared_objs) $(aobjs)
	@echo " ---> Creating Shared Lib $@"
	@$(CC) $(CFLAGS) --shared $(LDFLAGS) -o $@ $^
	@(cd $(@D); ln -f -s $(so_lib_inst) $(soname))
||||
# Target for install
prefix = /usr/local
install_dirs = $(prefix)/lib $(prefix)/include/isa-l
$(install_dirs): ; mkdir -p $@
# NOTE(review): headers and the shared lib are installed mode 664
# (group-writable) while the static lib is 644 -- confirm 664 is intended.
install: $(sort $(extern_hdrs)) | $(install_dirs) $(lib_name) $(so_lib_name) isa-l.h
	install -m 644 $(lib_name) $(prefix)/lib/libisal.a
	install -m 644 $^ $(prefix)/include/isa-l/.
	install -m 664 include/isa-l.h $(prefix)/include/.
	install -m 664 $(so_lib_name) $(prefix)/lib/$(so_lib_ver)
	(cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(soname) && ln -f -s $(so_lib_ver) $(so_lib_inst))
ifeq ($(shell uname),Darwin)
	(cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(basename $(so_lib_inst)).dylib)
endif
	which libtool && libtool --mode=finish $(prefix)/lib || \
	echo 'Lib installed at $(prefix)/lib. Run system-dependent programs to add shared lib path.'

uninstall:
	$(RM) $(prefix)/lib/libisal.a
	$(RM) $(prefix)/lib/$(soname)
	$(RM) $(prefix)/lib/$(so_lib_ver)
	$(RM) $(prefix)/lib/$(so_lib_inst)
	$(RM) -r $(prefix)/include/isa-l
	$(RM) $(prefix)/include/isa-l.h
	$(RM) $(prefix)/lib/$(basename $(so_lib_inst)).dylib

# Collect performance data
rpt_name = perf_report_$(shell uname -n)_$(shell date +%y%m%d).perf

# Appends a run of the perf targets to a per-host, per-day report file;
# uses $(MAKE) so -j/-n propagate to the sub-make.
perf_report:
	echo Results for $(rpt_name) >> $(rpt_name)
	$(MAKE) -k perf | tee -a $(rpt_name)
	@echo Summary:
	-grep runtime $(rpt_name)


# NOTE(review): none of the command targets here (all, clean, test, install,
# perf, ...) are declared .PHONY, so a file with one of those names would
# shadow the target -- confirm whether upstream relies on that never happening.
clean:
	@echo Cleaning up
	@$(RM) -r $(O) *.o *.a $(all_tests) $(lib_name) $(so_lib_name)
31
tools/yasm-filter.sh
Executable file
31
tools/yasm-filter.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/bin/sh
# (fix: original shebang was "#/bin/sh" -- missing '!', so the kernel never
# recognized it as an interpreter line.)

# Filter out unnecessary options added by automake before invoking yasm.
# Options yasm understands are accumulated in $options; unknown options are
# dropped; non-option words (input files) are kept in $args.

# Initialize explicitly so values inherited from the environment cannot
# leak into the yasm command line.
options=
args=

# Use $# rather than [ -n "$*" ]: the latter exits early (dropping the rest
# of the arguments) when the remaining arguments are empty strings.
while [ $# -gt 0 ]; do
	case "$1" in
	-f | -o | -I | -i | -D )
		# Supported options that take a separate argument
		options="$options $1 $2"
		shift
		shift
		;;
	-I* | -i* | --prefix* )
		# Supported options with the argument attached
		options="$options $1"
		shift
		;;
	#-blah )
	# Unsupported options with args - none known
	-* )
		# Unsupported option with no argument: drop it
		shift
		;;
	* )
		# Not an option: keep as a positional argument (source file, etc.)
		args="$args $1"
		shift
		;;
	esac
done

yasm $options $args
Loading…
Reference in New Issue
Block a user