Enable VSX SIMD in ISA-L for ppc64le

1) Implement the erasure code functions with Altivec intrinsics
2) Coding style update

Change-Id: I2c81d035f4083e9b011dbf3b741f628813b68606
Thanks-to: Daniel Axtens <dja@axtens.net>
Signed-off-by: Hong Bo Peng <penghb@cn.ibm.com>
Authored by Hong Bo Peng on 2020-02-20 11:47:53 +08:00; committed by Greg Tucker
parent a3d5cd8642
commit 180c74aefd
23 changed files with 1798 additions and 0 deletions


@@ -27,9 +27,11 @@ other_tests=
other_tests_x86_64=
other_tests_x86_32=
other_tests_aarch64=
other_tests_ppc64le=
lsrc_x86_64=
lsrc_x86_32=
lsrc_aarch64=
lsrc_ppc64le=
lsrc_base_aliases=
lsrc32=
unit_tests32=
@@ -71,6 +73,11 @@ libisal_la_SOURCES += ${lsrc_aarch64}
other_tests += ${other_tests_aarch64}
endif
if CPU_PPC64LE
libisal_la_SOURCES += ${lsrc_ppc64le}
other_tests += ${other_tests_ppc64le}
endif
if CPU_UNDEFINED
libisal_la_SOURCES += ${lsrc_base_aliases}
endif


@@ -30,10 +30,13 @@ AS_CASE([$host_cpu],
[i?86], [CPU="x86_32"],
[aarch64], [CPU="aarch64"],
[arm64], [CPU="aarch64"],
[powerpc64le], [CPU="ppc64le"],
[ppc64le], [CPU="ppc64le"],
)
AM_CONDITIONAL([CPU_X86_64], [test "$CPU" = "x86_64"])
AM_CONDITIONAL([CPU_X86_32], [test "$CPU" = "x86_32"])
AM_CONDITIONAL([CPU_AARCH64], [test "$CPU" = "aarch64"])
AM_CONDITIONAL([CPU_PPC64LE], [test "$CPU" = "ppc64le"])
AM_CONDITIONAL([CPU_UNDEFINED], [test "x$CPU" = "x"])
if test "$CPU" = "x86_64"; then


@@ -35,6 +35,7 @@ lsrc += \
lsrc_base_aliases += crc/crc_base_aliases.c
lsrc_x86_32 += crc/crc_base_aliases.c
lsrc_ppc64le += crc/crc_base_aliases.c
lsrc_x86_64 += \
crc/crc16_t10dif_01.asm \


@@ -29,6 +29,8 @@
include erasure_code/aarch64/Makefile.am
include erasure_code/ppc64le/Makefile.am
lsrc += erasure_code/ec_base.c
lsrc_base_aliases += erasure_code/ec_base_aliases.c


@@ -0,0 +1,15 @@
lsrc_ppc64le += erasure_code/ppc64le/ec_base_vsx.c \
erasure_code/ppc64le/gf_vect_mul_vsx.c \
erasure_code/ppc64le/gf_vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_vect_mad_vsx.c \
erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_2vect_mad_vsx.c \
erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_3vect_mad_vsx.c \
erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_4vect_mad_vsx.c \
erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_5vect_mad_vsx.c \
erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_6vect_mad_vsx.c


@@ -0,0 +1,97 @@
#include "erasure_code.h"
#include "ec_base_vsx.h"
void gf_vect_dot_prod(int len, int vlen, unsigned char *v,
unsigned char **src, unsigned char *dest)
{
gf_vect_dot_prod_vsx(len, vlen, v, src, dest);
}
void gf_vect_mad(int len, int vec, int vec_i, unsigned char *v,
unsigned char *src, unsigned char *dest)
{
gf_vect_mad_vsx(len, vec, vec_i, v, src, dest);
}
void ec_encode_data(int len, int srcs, int dests, unsigned char *v,
unsigned char **src, unsigned char **dest)
{
if (len < 64) {
ec_encode_data_base(len, srcs, dests, v, src, dest);
return;
}
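/* Each output row consumes 32 * srcs bytes of gftbls, so each pass below emits
 * six rows and advances v by 6 * srcs * 32; the switch that follows handles
 * the remaining (fewer than six) rows. */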
while (dests >= 6) {
gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
v += 6 * srcs * 32;
dest += 6;
dests -= 6;
}
switch (dests) {
case 6:
gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 5:
gf_5vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 4:
gf_4vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 3:
gf_3vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 2:
gf_2vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 1:
gf_vect_dot_prod_vsx(len, srcs, v, src, *dest);
break;
case 0:
break;
}
}
void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v,
unsigned char *data, unsigned char **dest)
{
if (len < 64) {
ec_encode_data_update_base(len, k, rows, vec_i, v, data, dest);
return;
}
while (rows >= 6) {
gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
v += 6 * k * 32;
dest += 6;
rows -= 6;
}
switch (rows) {
case 6:
gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 5:
gf_5vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 4:
gf_4vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 3:
gf_3vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 2:
gf_2vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 1:
gf_vect_mad_vsx(len, k, vec_i, v, data, *dest);
break;
case 0:
break;
}
}
int gf_vect_mul(int len, unsigned char *a, void *src, void *dest)
{
gf_vect_mul_vsx(len, a, (unsigned char *)src, (unsigned char *)dest);
return 0;
}
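
The dispatch functions above assume the caller has already expanded the coding matrix with ec_init_tables(). A minimal usage sketch built on the existing ISA-L API follows; the 3-data/2-parity layout, buffer handling and names are illustrative only and not part of this patch.

/* Illustrative only: encode p = 2 parity fragments from k = 3 data fragments. */
#include <stdlib.h>
#include "erasure_code.h"

static void example_encode(int len)
{
	int i, k = 3, p = 2;
	unsigned char *frag[3 + 2];			/* data in frag[0..2], parity in frag[3..4] */
	unsigned char a[(3 + 2) * 3], gftbls[2 * 3 * 32];

	for (i = 0; i < k + p; i++)
		frag[i] = malloc(len);			/* fill frag[0..k-1] with data; error handling omitted */

	gf_gen_rs_matrix(a, k + p, k);			/* (k+p) x k generator matrix */
	ec_init_tables(k, p, &a[k * k], gftbls);	/* expand the p coding rows into 32-byte tables */
	ec_encode_data(len, k, p, gftbls, frag, &frag[k]);
}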


@@ -0,0 +1,338 @@
#ifndef _ERASURE_CODE_PPC64LE_H_
#define _ERASURE_CODE_PPC64LE_H_
#include "erasure_code.h"
#include <altivec.h>
#ifdef __cplusplus
extern "C" {
#endif
#if defined(__ibmxl__)
#define EC_vec_xl(a, b) vec_xl_be(a, b)
#define EC_vec_permxor(va, vb, vc) __vpermxor(va, vb, vc)
#elif defined __GNUC__ && __GNUC__ >= 8
#define EC_vec_xl(a, b) vec_xl_be(a, b)
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vc)
#elif defined __GNUC__ && __GNUC__ >= 7
#if defined _ARCH_PWR9
#define EC_vec_xl(a, b) vec_vsx_ld(a, b)
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
#else
inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
vector unsigned char vc;
__asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr));
return vc;
}
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
#endif
#else
#if defined _ARCH_PWR8
inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
vector unsigned char vc;
__asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr));
return vc;
}
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
#else
#error "This code is only supported on ppc64le."
#endif
#endif
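
Every kernel below implements the same split-table GF(2^8) multiply with vpermxor. A rough scalar model of what EC_vec_permxor(vhi, vlo, vX) computes per byte is shown here, assuming the usual ISA-L 32-byte table layout (low-nibble products in bytes 0-15, high-nibble products in bytes 16-31); the helper is illustrative and not part of this patch.

/* Illustrative only: scalar equivalent of one byte of EC_vec_permxor(vhi, vlo, x),
 * where tbl is the 32-byte table built by gf_vect_mul_init()/ec_init_tables(). */
static inline unsigned char gf_mul_split_tbl(const unsigned char tbl[32], unsigned char x)
{
	return tbl[x & 0x0f] ^ tbl[16 + (x >> 4)];	/* low-table lookup ^ high-table lookup */
}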
/**
* @brief GF(2^8) vector multiply. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constant and save to destination array. Can be used for erasure coding encode
* and decode update when only one source is available at a time. Function
* requires pre-calculation of a 32 byte constant array based on the input
* coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_mul_vsx(int len, unsigned char *gftbls, unsigned char *src, unsigned char *dest);
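
For a single coefficient the 32-byte table can be produced with the existing gf_vect_mul_init() helper. A hedged usage sketch (the coefficient and names are illustrative; len is assumed to meet gf_vect_mul()'s usual multiple-of-32 length requirement):

/* Illustrative only: dest[i] = c * src[i] over GF(2^8) for the whole buffer. */
static void example_mul(int len, unsigned char *src, unsigned char *dest)
{
	unsigned char c = 0x1d, tbl[32];	/* c is an arbitrary example coefficient */

	gf_vect_mul_init(c, tbl);		/* expand c into the 32-byte split table */
	gf_vect_mul_vsx(len, tbl, src, dest);
}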
/**
* @brief GF(2^8) vector dot product. VSX version.
*
* Does a GF(2^8) dot product across each byte of the input array and a constant
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
* on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
/**
* @brief GF(2^8) vector dot product with two outputs. VSX version.
*
* Vector dot product optimized to calculate two outputs at a time. Does two
* GF(2^8) dot products across each byte of the input array and two constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with three outputs. VSX version.
*
* Vector dot product optimized to calculate three outputs at a time. Does three
* GF(2^8) dot products across each byte of the input array and three constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with four outputs. VSX version.
*
* Vector dot product optimized to calculate four outputs at a time. Does four
* GF(2^8) dot products across each byte of the input array and four constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with five outputs. VSX version.
*
* Vector dot product optimized to calculate five outputs at a time. Does five
* GF(2^8) dot products across each byte of the input array and five constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with six outputs. VSX version.
*
* Vector dot product optimized to calculate six outputs at a time. Does six
* GF(2^8) dot products across each byte of the input array and six constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector multiply accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constant and add to destination array. Can be used for erasure coding encode
* and decode update when only one source is available at a time. Function
* requires pre-calculation of a 32*vec byte constant array based on the input
* coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char *dest);
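
In the update path each data fragment is folded into the parity as it arrives, via the ec_encode_data_update() wrapper defined in ec_base_vsx.c, which fans out to these mad kernels. A sketch reusing the names from the encode example earlier (illustrative; parity buffers are assumed to start zeroed because the update accumulates into them):

/* Illustrative only: incremental encode, one data fragment at a time. */
memset(frag[k], 0, len);		/* parity accumulates, so start from zero */
memset(frag[k + 1], 0, len);
for (i = 0; i < k; i++)
	ec_encode_data_update(len, k, p, i, gftbls, frag[i], &frag[k]);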
/**
* @brief GF(2^8) vector multiply with 2 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 3 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 4 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 5 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 6 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
#ifdef __cplusplus
}
#endif
#endif //_ERASURE_CODE_PPC64LE_H_


@@ -0,0 +1,83 @@
#include "ec_base_vsx.h"
void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4;
vector unsigned char vYD, vYE, vYF, vYG;
vector unsigned char vhi0, vlo0, vhi1, vlo1;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
for (j = 1; j < vlen; j++) {
gf_2vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
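/* Let the scalar base routine handle the first len % 64 bytes; the VSX loop
 * below then processes the remaining whole 64-byte blocks starting at head. */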
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
}
for (i = head; i < len - 63; i += 64) {
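/* x ^ x == 0: clear the eight 16-byte accumulators for this block. */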
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
g0 += 32;
g1 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
}
return;
}


@@ -0,0 +1,65 @@
#include "ec_base_vsx.h"
void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4;
vector unsigned char vYD, vYE, vYF, vYG;
vector unsigned char vhi0, vlo0, vhi1, vlo1;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
}
return;
}


@@ -0,0 +1,104 @@
#include "ec_base_vsx.h"
void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
for (j = 1; j < vlen; j++) {
gf_3vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = vec_xl(0, g2);
vhi2 = vec_xl(16, g2);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
}
return;
}


@@ -0,0 +1,84 @@
#include "ec_base_vsx.h"
void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
}
return;
}


@@ -0,0 +1,124 @@
#include "ec_base_vsx.h"
void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
for (j = 1; j < vlen; j++) {
gf_4vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
vYJ = vYJ ^ vYJ;
vYK = vYK ^ vYK;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
unsigned char *g3 = &gftbls[3 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = vec_xl(0, g2);
vhi2 = vec_xl(16, g2);
vlo3 = vec_xl(0, g3);
vhi3 = vec_xl(16, g3);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
g3 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
}
return;
}


@@ -0,0 +1,103 @@
#include "ec_base_vsx.h"
void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vec_xl(0, t3 + i);
vY8 = vec_xl(16, t3 + i);
vYJ = vec_xl(32, t3 + i);
vYK = vec_xl(48, t3 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
}
return;
}


@@ -0,0 +1,145 @@
#include "ec_base_vsx.h"
void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
for (j = 1; j < vlen; j++) {
gf_5vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
vY9 = vY9 ^ vY9;
vYA = vYA ^ vYA;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
vYJ = vYJ ^ vYJ;
vYK = vYK ^ vYK;
vYL = vYL ^ vYL;
vYM = vYM ^ vYM;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
unsigned char *g3 = &gftbls[3 * 32 * vlen];
unsigned char *g4 = &gftbls[4 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = vec_xl(0, g2);
vhi2 = vec_xl(16, g2);
vlo3 = vec_xl(0, g3);
vhi3 = vec_xl(16, g3);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vlo4 = vec_xl(0, g4);
vhi4 = vec_xl(16, g4);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
g3 += 32;
g4 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
}
return;
}


@@ -0,0 +1,122 @@
#include "ec_base_vsx.h"
void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vec_xl(0, t3 + i);
vY8 = vec_xl(16, t3 + i);
vYJ = vec_xl(32, t3 + i);
vYK = vec_xl(48, t3 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vec_xl(0, t4 + i);
vYA = vec_xl(16, t4 + i);
vYL = vec_xl(32, t4 + i);
vYM = vec_xl(48, t4 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
}
return;
}


@@ -0,0 +1,166 @@
#include "ec_base_vsx.h"
void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
gf_vect_mul_vsx(len, &gftbls[5 * 32 * vlen], src[0], (unsigned char *)dest[5]);
for (j = 1; j < vlen; j++) {
gf_6vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
t5 = (unsigned char *)dest[5];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
gf_vect_dot_prod_base(head, vlen, &gftbls[5 * 32 * vlen], src, t5);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
vY9 = vY9 ^ vY9;
vYA = vYA ^ vYA;
vYB = vYB ^ vYB;
vYC = vYC ^ vYC;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
vYJ = vYJ ^ vYJ;
vYK = vYK ^ vYK;
vYL = vYL ^ vYL;
vYM = vYM ^ vYM;
vYN = vYN ^ vYN;
vYO = vYO ^ vYO;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
unsigned char *g3 = &gftbls[3 * 32 * vlen];
unsigned char *g4 = &gftbls[4 * 32 * vlen];
unsigned char *g5 = &gftbls[5 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = EC_vec_xl(0, g2);
vhi2 = EC_vec_xl(16, g2);
vlo3 = EC_vec_xl(0, g3);
vhi3 = EC_vec_xl(16, g3);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vlo4 = EC_vec_xl(0, g4);
vhi4 = EC_vec_xl(16, g4);
vlo5 = EC_vec_xl(0, g5);
vhi5 = EC_vec_xl(16, g5);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
g3 += 32;
g4 += 32;
g5 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYB, 0, t5 + i);
vec_xst(vYC, 16, t5 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
vec_xst(vYN, 32, t5 + i);
vec_xst(vYO, 48, t5 + i);
}
return;
}


@@ -0,0 +1,142 @@
#include "ec_base_vsx.h"
void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
t5 = (unsigned char *)dest[5];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
gf_vect_mad_base(head, vec, vec_i, &gftbls[5 * 32 * vec], src, t5);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
vlo5 = EC_vec_xl(0, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
vhi5 = EC_vec_xl(16, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vec_xl(0, t3 + i);
vY8 = vec_xl(16, t3 + i);
vYJ = vec_xl(32, t3 + i);
vYK = vec_xl(48, t3 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vec_xl(0, t4 + i);
vYA = vec_xl(16, t4 + i);
vYL = vec_xl(32, t4 + i);
vYM = vec_xl(48, t4 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
vYB = vec_xl(0, t5 + i);
vYC = vec_xl(16, t5 + i);
vYN = vec_xl(32, t5 + i);
vYO = vec_xl(48, t5 + i);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
vec_xst(vYB, 0, t5 + i);
vec_xst(vYC, 16, t5 + i);
vec_xst(vYN, 32, t5 + i);
vec_xst(vYO, 48, t5 + i);
}
return;
}


@@ -0,0 +1,85 @@
#include "ec_base_vsx.h"
void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest)
{
unsigned char *s, *t0;
vector unsigned char vX1, vY1;
vector unsigned char vX2, vY2;
vector unsigned char vX3, vY3;
vector unsigned char vX4, vY4;
vector unsigned char vX5, vY5;
vector unsigned char vX6, vY6;
vector unsigned char vX7, vY7;
vector unsigned char vX8, vY8;
vector unsigned char vhi0, vlo0;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest);
for (j = 1; j < vlen; j++) {
gf_vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest;
head = len % 128;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
}
for (i = head; i < len - 127; i += 128) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vX5 = vec_xl(64, s + i);
vX6 = vec_xl(80, s + i);
vX7 = vec_xl(96, s + i);
vX8 = vec_xl(112, s + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY5 = vY5 ^ EC_vec_permxor(vhi0, vlo0, vX5);
vY6 = vY6 ^ EC_vec_permxor(vhi0, vlo0, vX6);
vY7 = vY7 ^ EC_vec_permxor(vhi0, vlo0, vX7);
vY8 = vY8 ^ EC_vec_permxor(vhi0, vlo0, vX8);
g0 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 32, t0 + i);
vec_xst(vY4, 48, t0 + i);
vec_xst(vY5, 64, t0 + i);
vec_xst(vY6, 80, t0 + i);
vec_xst(vY7, 96, t0 + i);
vec_xst(vY8, 112, t0 + i);
}
return;
}


@@ -0,0 +1,48 @@
#include "ec_base_vsx.h"
void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char *dest)
{
unsigned char *s, *t0;
vector unsigned char vX1, vY1;
vector unsigned char vX2, vY2;
vector unsigned char vX3, vY3;
vector unsigned char vX4, vY4;
vector unsigned char vhi0, vlo0;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest;
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, dest);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vY3 = vec_xl(32, t0 + i);
vY4 = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 32, t0 + i);
vec_xst(vY4, 48, t0 + i);
}
return;
}


@@ -0,0 +1,61 @@
#include "ec_base_vsx.h"
void gf_vect_mul_vsx(int len, unsigned char *gftbl, unsigned char *src, unsigned char *dest)
{
unsigned char *s, *t0;
vector unsigned char vX1, vY1;
vector unsigned char vX2, vY2;
vector unsigned char vX3, vY3;
vector unsigned char vX4, vY4;
vector unsigned char vX5, vY5;
vector unsigned char vX6, vY6;
vector unsigned char vX7, vY7;
vector unsigned char vX8, vY8;
vector unsigned char vhi0, vlo0;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest;
head = len % 128;
if (head != 0) {
gf_vect_mul_base(head, gftbl, src, dest);
}
vlo0 = EC_vec_xl(0, gftbl);
vhi0 = EC_vec_xl(16, gftbl);
for (i = head; i < len - 127; i += 128) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vX5 = vec_xl(64, s + i);
vX6 = vec_xl(80, s + i);
vX7 = vec_xl(96, s + i);
vX8 = vec_xl(112, s + i);
vY1 = EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = EC_vec_permxor(vhi0, vlo0, vX2);
vY3 = EC_vec_permxor(vhi0, vlo0, vX3);
vY4 = EC_vec_permxor(vhi0, vlo0, vX4);
vY5 = EC_vec_permxor(vhi0, vlo0, vX5);
vY6 = EC_vec_permxor(vhi0, vlo0, vX6);
vY7 = EC_vec_permxor(vhi0, vlo0, vX7);
vY8 = EC_vec_permxor(vhi0, vlo0, vX8);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 32, t0 + i);
vec_xst(vY4, 48, t0 + i);
vec_xst(vY5, 64, t0 + i);
vec_xst(vY6, 80, t0 + i);
vec_xst(vY7, 96, t0 + i);
vec_xst(vY8, 112, t0 + i);
}
return;
}


@@ -38,6 +38,7 @@ lsrc += igzip/igzip.c \
lsrc_base_aliases += igzip/igzip_base_aliases.c igzip/proc_heap_base.c
lsrc_x86_32 += igzip/igzip_base_aliases.c igzip/proc_heap_base.c
lsrc_ppc64le += igzip/igzip_base_aliases.c igzip/proc_heap_base.c
lsrc_aarch64 += igzip/aarch64/igzip_inflate_multibinary_arm64.S \
igzip/aarch64/igzip_multibinary_arm64.S \


@@ -32,6 +32,7 @@ include mem/aarch64/Makefile.am
lsrc += mem/mem_zero_detect_base.c
lsrc_base_aliases += mem/mem_zero_detect_base_aliases.c
lsrc_ppc64le += mem/mem_zero_detect_base_aliases.c
lsrc_x86_64 += mem/mem_zero_detect_avx.asm \
mem/mem_zero_detect_sse.asm \


@@ -32,6 +32,7 @@ include raid/aarch64/Makefile.am
lsrc += raid/raid_base.c
lsrc_base_aliases += raid/raid_base_aliases.c
lsrc_ppc64le += raid/raid_base_aliases.c
lsrc_x86_64 += \
raid/xor_gen_sse.asm \