Enable VSX SIMD in ISA-L for ppc64le

1) Implement the erasure code functions with AltiVec intrinsics
2) Coding style update

Change-Id: I2c81d035f4083e9b011dbf3b741f628813b68606
Thanks-to: Daniel Axtens <dja@axtens.net>
Signed-off-by: Hong Bo Peng <penghb@cn.ibm.com>
Hong Bo Peng 2020-02-20 11:47:53 +08:00 committed by Greg Tucker
parent a3d5cd8642
commit 180c74aefd
23 changed files with 1798 additions and 0 deletions


@@ -27,9 +27,11 @@ other_tests=
other_tests_x86_64=
other_tests_x86_32=
other_tests_aarch64=
other_tests_ppc64le=
lsrc_x86_64=
lsrc_x86_32=
lsrc_aarch64=
lsrc_ppc64le=
lsrc_base_aliases=
lsrc32=
unit_tests32=
@@ -71,6 +73,11 @@ libisal_la_SOURCES += ${lsrc_aarch64}
other_tests += ${other_tests_aarch64}
endif
if CPU_PPC64LE
libisal_la_SOURCES += ${lsrc_ppc64le}
other_tests += ${other_tests_ppc64le}
endif
if CPU_UNDEFINED
libisal_la_SOURCES += ${lsrc_base_aliases}
endif


@@ -30,10 +30,13 @@ AS_CASE([$host_cpu],
[i?86], [CPU="x86_32"],
[aarch64], [CPU="aarch64"],
[arm64], [CPU="aarch64"],
[powerpc64le], [CPU="ppc64le"],
[ppc64le], [CPU="ppc64le"],
)
AM_CONDITIONAL([CPU_X86_64], [test "$CPU" = "x86_64"])
AM_CONDITIONAL([CPU_X86_32], [test "$CPU" = "x86_32"])
AM_CONDITIONAL([CPU_AARCH64], [test "$CPU" = "aarch64"])
AM_CONDITIONAL([CPU_PPC64LE], [test "$CPU" = "ppc64le"])
AM_CONDITIONAL([CPU_UNDEFINED], [test "x$CPU" = "x"])
if test "$CPU" = "x86_64"; then


@@ -35,6 +35,7 @@ lsrc += \
lsrc_base_aliases += crc/crc_base_aliases.c
lsrc_x86_32 += crc/crc_base_aliases.c
lsrc_ppc64le += crc/crc_base_aliases.c
lsrc_x86_64 += \
crc/crc16_t10dif_01.asm \


@@ -29,6 +29,8 @@
include erasure_code/aarch64/Makefile.am
include erasure_code/ppc64le/Makefile.am
lsrc += erasure_code/ec_base.c
lsrc_base_aliases += erasure_code/ec_base_aliases.c


@@ -0,0 +1,15 @@
lsrc_ppc64le += erasure_code/ppc64le/ec_base_vsx.c \
erasure_code/ppc64le/gf_vect_mul_vsx.c \
erasure_code/ppc64le/gf_vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_vect_mad_vsx.c \
erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_2vect_mad_vsx.c \
erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_3vect_mad_vsx.c \
erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_4vect_mad_vsx.c \
erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_5vect_mad_vsx.c \
erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c \
erasure_code/ppc64le/gf_6vect_mad_vsx.c


@@ -0,0 +1,97 @@
#include "erasure_code.h"
#include "ec_base_vsx.h"
void gf_vect_dot_prod(int len, int vlen, unsigned char *v,
unsigned char **src, unsigned char *dest)
{
gf_vect_dot_prod_vsx(len, vlen, v, src, dest);
}
void gf_vect_mad(int len, int vec, int vec_i, unsigned char *v,
unsigned char *src, unsigned char *dest)
{
gf_vect_mad_vsx(len, vec, vec_i, v, src, dest);
}
void ec_encode_data(int len, int srcs, int dests, unsigned char *v,
unsigned char **src, unsigned char **dest)
{
if (len < 64) {
ec_encode_data_base(len, srcs, dests, v, src, dest);
return;
}
while (dests >= 6) {
gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
v += 6 * srcs * 32;
dest += 6;
dests -= 6;
}
switch (dests) {
case 6:
gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 5:
gf_5vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 4:
gf_4vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 3:
gf_3vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 2:
gf_2vect_dot_prod_vsx(len, srcs, v, src, dest);
break;
case 1:
gf_vect_dot_prod_vsx(len, srcs, v, src, *dest);
break;
case 0:
break;
}
}
void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v,
unsigned char *data, unsigned char **dest)
{
if (len < 64) {
ec_encode_data_update_base(len, k, rows, vec_i, v, data, dest);
return;
}
while (rows >= 6) {
gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
v += 6 * k * 32;
dest += 6;
rows -= 6;
}
switch (rows) {
case 6:
gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 5:
gf_5vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 4:
gf_4vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 3:
gf_3vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 2:
gf_2vect_mad_vsx(len, k, vec_i, v, data, dest);
break;
case 1:
gf_vect_mad_vsx(len, k, vec_i, v, data, *dest);
break;
case 0:
break;
}
}
int gf_vect_mul(int len, unsigned char *a, void *src, void *dest)
{
gf_vect_mul_vsx(len, a, (unsigned char *)src, (unsigned char *)dest);
return 0;
}
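
For context, this dispatch layer slots into the standard ISA-L encode flow. A minimal sketch of that flow follows, assuming only the public erasure_code.h API (gf_gen_cauchy1_matrix(), ec_init_tables()); the fragment counts and names are hypothetical:

#include "erasure_code.h"

enum { K = 10, P = 4, LEN = 4096 };

/* frag[0..K-1] hold data fragments, frag[K..K+P-1] receive parity. */
static void encode_sketch(unsigned char *frag[K + P])
{
	static unsigned char encode_matrix[(K + P) * K];
	static unsigned char g_tbls[P * K * 32];	/* 32 table bytes per coefficient */

	gf_gen_cauchy1_matrix(encode_matrix, K + P, K);
	/* Expand the P coding rows into the lookup tables the VSX kernels consume. */
	ec_init_tables(K, P, &encode_matrix[K * K], g_tbls);
	/* len >= 64 routes into the gf_{1..6}vect_dot_prod_vsx kernels above. */
	ec_encode_data(LEN, K, P, g_tbls, frag, &frag[K]);
}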


@@ -0,0 +1,338 @@
#ifndef _ERASURE_CODE_PPC64LE_H_
#define _ERASURE_CODE_PPC64LE_H_
#include "erasure_code.h"
#include <altivec.h>
#ifdef __cplusplus
extern "C" {
#endif
#if defined(__ibmxl__)
#define EC_vec_xl(a, b) vec_xl_be(a, b)
#define EC_vec_permxor(va, vb, vc) __vpermxor(va, vb, vc)
#elif defined __GNUC__ && __GNUC__ >= 8
#define EC_vec_xl(a, b) vec_xl_be(a, b)
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vc)
#elif defined __GNUC__ && __GNUC__ >= 7
#if defined _ARCH_PWR9
#define EC_vec_xl(a, b) vec_vsx_ld(a, b)
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
#else
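/* Equivalent of vec_xl_be() for older compilers: load 16 bytes with lxvd2x,
   then swap doublewords with xxswapd. */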
static inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
vector unsigned char vc;
__asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr));
return vc;
}
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
#endif
#else
#if defined _ARCH_PWR8
static inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
vector unsigned char vc;
__asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr));
return vc;
}
#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
#else
#error "This code is only supported on ppc64le."
#endif
#endif
/**
* @brief GF(2^8) vector multiply. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constant and save to destination array. Can be used for erasure coding encode
* and decode update when only one source is available at a time. Function
* requires pre-calculation of a 32 byte constant array based on the input
* coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32.
* @param src Pointer to source input array.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_mul_vsx(int len, unsigned char *gftbls, unsigned char *src, unsigned char *dest);
/**
* @brief GF(2^8) vector dot product. VSX version.
*
* Does a GF(2^8) dot product across each byte of the input array and a constant
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
* on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
/**
* @brief GF(2^8) vector dot product with two outputs. VSX version.
*
* Vector dot product optimized to calculate two outputs at a time. Does two
* GF(2^8) dot products across each byte of the input array and two constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with three outputs. VSX version.
*
* Vector dot product optimized to calculate three outputs at a time. Does three
* GF(2^8) dot products across each byte of the input array and three constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with four outputs. VSX version.
*
* Vector dot product optimized to calculate four outputs at a time. Does four
* GF(2^8) dot products across each byte of the input array and four constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with five outputs. VSX version.
*
* Vector dot product optimized to calculate five outputs at a time. Does five
* GF(2^8) dot products across each byte of the input array and five constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector dot product with six outputs. VSX version.
*
* Vector dot product optimized to calculate six outputs at a time. Does six
* GF(2^8) dot products across each byte of the input array and six constant
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
* based on the array of input coefficients.
* @param src Array of pointers to source inputs.
* @param dest Array of pointers to destination data buffers.
* @returns none
*/
void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest);
/**
* @brief GF(2^8) vector multiply accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constant and add to destination array. Can be used for erasure coding encode
* and decode update when only one source is available at a time. Function
* requires pre-calculation of a 32*vec byte constant array based on the input
* coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Pointer to destination data array.
* @returns none
*/
void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char *dest);
/**
* @brief GF(2^8) vector multiply with 2 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 3 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 4 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 5 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
/**
* @brief GF(2^8) vector multiply with 6 accumulate. VSX version.
*
* Does a GF(2^8) multiply across each byte of input source with expanded
* constants and add to destination arrays. Can be used for erasure coding
* encode and decode update when only one source is available at a
* time. Function requires pre-calculation of a 32*vec byte constant array based
* on the input coefficients.
* @requires VSX
*
* @param len Length of each vector in bytes.
* @param vec The number of vector sources or rows in the generator matrix
* for coding.
* @param vec_i The vector index corresponding to the single input source.
* @param gftbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*vec.
* @param src Pointer to source input array.
* @param dest Array of pointers to destination input/outputs.
* @returns none
*/
void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
unsigned char **dest);
#ifdef __cplusplus
}
#endif
#endif //_ERASURE_CODE_PPC64LE_H_
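
The core trick in all of these kernels is the vpermxor nibble lookup. As a reference for what EC_vec_permxor(vhi, vlo, vX) computes in each byte lane, here is a scalar model, assuming the 32-byte table layout gf_vect_mul_init() produces (coefficient times 0x00..0x0f in bytes 0..15, times 0x00, 0x10, ..., 0xf0 in bytes 16..31):

/* Scalar model of one vpermxor lane: split x into nibbles, look up the two
 * partial products, and XOR them to get c * x over GF(2^8). */
static unsigned char gf_mul_byte(const unsigned char tbl[32], unsigned char x)
{
	return tbl[x & 0x0f] ^ tbl[16 + (x >> 4)];
}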


@@ -0,0 +1,83 @@
#include "ec_base_vsx.h"
void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4;
vector unsigned char vYD, vYE, vYF, vYG;
vector unsigned char vhi0, vlo0, vhi1, vlo1;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
for (j = 1; j < vlen; j++) {
gf_2vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
g0 += 32;
g1 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
}
return;
}


@@ -0,0 +1,65 @@
#include "ec_base_vsx.h"
void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4;
vector unsigned char vYD, vYE, vYF, vYG;
vector unsigned char vhi0, vlo0, vhi1, vlo1;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
}
return;
}
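
A note on the ordering above: the destination loads for each output stream are issued before the stores of the previous stream, interleaved with the permxor work. This looks like deliberate software pipelining to keep VSX load/store latency off the critical path; the same pattern repeats in the wider _mad kernels below.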


@@ -0,0 +1,104 @@
#include "ec_base_vsx.h"
void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
for (j = 1; j < vlen; j++) {
gf_3vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = vec_xl(0, g2);
vhi2 = vec_xl(16, g2);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
}
return;
}


@@ -0,0 +1,84 @@
#include "ec_base_vsx.h"
void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
}
return;
}


@@ -0,0 +1,124 @@
#include "ec_base_vsx.h"
void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
for (j = 1; j < vlen; j++) {
gf_4vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
vYJ = vYJ ^ vYJ;
vYK = vYK ^ vYK;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
unsigned char *g3 = &gftbls[3 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = vec_xl(0, g2);
vhi2 = vec_xl(16, g2);
vlo3 = vec_xl(0, g3);
vhi3 = vec_xl(16, g3);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
g3 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
}
return;
}


@@ -0,0 +1,103 @@
#include "ec_base_vsx.h"
void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vec_xl(0, t3 + i);
vY8 = vec_xl(16, t3 + i);
vYJ = vec_xl(32, t3 + i);
vYK = vec_xl(48, t3 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
}
return;
}


@@ -0,0 +1,145 @@
#include "ec_base_vsx.h"
void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
for (j = 1; j < vlen; j++) {
gf_5vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
vY9 = vY9 ^ vY9;
vYA = vYA ^ vYA;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
vYJ = vYJ ^ vYJ;
vYK = vYK ^ vYK;
vYL = vYL ^ vYL;
vYM = vYM ^ vYM;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
unsigned char *g3 = &gftbls[3 * 32 * vlen];
unsigned char *g4 = &gftbls[4 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = vec_xl(0, g2);
vhi2 = vec_xl(16, g2);
vlo3 = vec_xl(0, g3);
vhi3 = vec_xl(16, g3);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vlo4 = vec_xl(0, g4);
vhi4 = vec_xl(16, g4);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
g3 += 32;
g4 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
}
return;
}


@@ -0,0 +1,122 @@
#include "ec_base_vsx.h"
void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vec_xl(0, t3 + i);
vY8 = vec_xl(16, t3 + i);
vYJ = vec_xl(32, t3 + i);
vYK = vec_xl(48, t3 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vec_xl(0, t4 + i);
vYA = vec_xl(16, t4 + i);
vYL = vec_xl(32, t4 + i);
vYM = vec_xl(48, t4 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
}
return;
}


@@ -0,0 +1,166 @@
#include "ec_base_vsx.h"
void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
gf_vect_mul_vsx(len, &gftbls[5 * 32 * vlen], src[0], (unsigned char *)dest[5]);
for (j = 1; j < vlen; j++) {
gf_6vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
t5 = (unsigned char *)dest[5];
head = len % 64;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
gf_vect_dot_prod_base(head, vlen, &gftbls[5 * 32 * vlen], src, t5);
}
for (i = head; i < len - 63; i += 64) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
vY9 = vY9 ^ vY9;
vYA = vYA ^ vYA;
vYB = vYB ^ vYB;
vYC = vYC ^ vYC;
vYD = vYD ^ vYD;
vYE = vYE ^ vYE;
vYF = vYF ^ vYF;
vYG = vYG ^ vYG;
vYH = vYH ^ vYH;
vYI = vYI ^ vYI;
vYJ = vYJ ^ vYJ;
vYK = vYK ^ vYK;
vYL = vYL ^ vYL;
vYM = vYM ^ vYM;
vYN = vYN ^ vYN;
vYO = vYO ^ vYO;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
unsigned char *g1 = &gftbls[1 * 32 * vlen];
unsigned char *g2 = &gftbls[2 * 32 * vlen];
unsigned char *g3 = &gftbls[3 * 32 * vlen];
unsigned char *g4 = &gftbls[4 * 32 * vlen];
unsigned char *g5 = &gftbls[5 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vlo1 = EC_vec_xl(0, g1);
vhi1 = EC_vec_xl(16, g1);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vlo2 = EC_vec_xl(0, g2);
vhi2 = EC_vec_xl(16, g2);
vlo3 = EC_vec_xl(0, g3);
vhi3 = EC_vec_xl(16, g3);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vlo4 = EC_vec_xl(0, g4);
vhi4 = EC_vec_xl(16, g4);
vlo5 = EC_vec_xl(0, g5);
vhi5 = EC_vec_xl(16, g5);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
g0 += 32;
g1 += 32;
g2 += 32;
g3 += 32;
g4 += 32;
g5 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYB, 0, t5 + i);
vec_xst(vYC, 16, t5 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
vec_xst(vYN, 32, t5 + i);
vec_xst(vYO, 48, t5 + i);
}
return;
}


@@ -0,0 +1,142 @@
#include "ec_base_vsx.h"
void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char **dest)
{
unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
vector unsigned char vX1, vX2, vX3, vX4;
vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest[0];
t1 = (unsigned char *)dest[1];
t2 = (unsigned char *)dest[2];
t3 = (unsigned char *)dest[3];
t4 = (unsigned char *)dest[4];
t5 = (unsigned char *)dest[5];
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
gf_vect_mad_base(head, vec, vec_i, &gftbls[5 * 32 * vec], src, t5);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
vlo5 = EC_vec_xl(0, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
vhi5 = EC_vec_xl(16, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vYD = vec_xl(32, t0 + i);
vYE = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vYD, 32, t0 + i);
vec_xst(vYE, 48, t0 + i);
vY3 = vec_xl(0, t1 + i);
vY4 = vec_xl(16, t1 + i);
vYF = vec_xl(32, t1 + i);
vYG = vec_xl(48, t1 + i);
vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
vec_xst(vY3, 0, t1 + i);
vec_xst(vY4, 16, t1 + i);
vec_xst(vYF, 32, t1 + i);
vec_xst(vYG, 48, t1 + i);
vY5 = vec_xl(0, t2 + i);
vY6 = vec_xl(16, t2 + i);
vYH = vec_xl(32, t2 + i);
vYI = vec_xl(48, t2 + i);
vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
vY7 = vec_xl(0, t3 + i);
vY8 = vec_xl(16, t3 + i);
vYJ = vec_xl(32, t3 + i);
vYK = vec_xl(48, t3 + i);
vec_xst(vY5, 0, t2 + i);
vec_xst(vY6, 16, t2 + i);
vec_xst(vYH, 32, t2 + i);
vec_xst(vYI, 48, t2 + i);
vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
vY9 = vec_xl(0, t4 + i);
vYA = vec_xl(16, t4 + i);
vYL = vec_xl(32, t4 + i);
vYM = vec_xl(48, t4 + i);
vec_xst(vY7, 0, t3 + i);
vec_xst(vY8, 16, t3 + i);
vec_xst(vYJ, 32, t3 + i);
vec_xst(vYK, 48, t3 + i);
vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
vYB = vec_xl(0, t5 + i);
vYC = vec_xl(16, t5 + i);
vYN = vec_xl(32, t5 + i);
vYO = vec_xl(48, t5 + i);
vec_xst(vY9, 0, t4 + i);
vec_xst(vYA, 16, t4 + i);
vec_xst(vYL, 32, t4 + i);
vec_xst(vYM, 48, t4 + i);
vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
vec_xst(vYB, 0, t5 + i);
vec_xst(vYC, 16, t5 + i);
vec_xst(vYN, 32, t5 + i);
vec_xst(vYO, 48, t5 + i);
}
return;
}


@@ -0,0 +1,85 @@
#include "ec_base_vsx.h"
void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest)
{
unsigned char *s, *t0;
vector unsigned char vX1, vY1;
vector unsigned char vX2, vY2;
vector unsigned char vX3, vY3;
vector unsigned char vX4, vY4;
vector unsigned char vX5, vY5;
vector unsigned char vX6, vY6;
vector unsigned char vX7, vY7;
vector unsigned char vX8, vY8;
vector unsigned char vhi0, vlo0;
int i, j, head;
if (vlen < 128) {
gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest);
for (j = 1; j < vlen; j++) {
gf_vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
}
return;
}
t0 = (unsigned char *)dest;
head = len % 128;
if (head != 0) {
gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
}
for (i = head; i < len - 127; i += 128) {
vY1 = vY1 ^ vY1;
vY2 = vY2 ^ vY2;
vY3 = vY3 ^ vY3;
vY4 = vY4 ^ vY4;
vY5 = vY5 ^ vY5;
vY6 = vY6 ^ vY6;
vY7 = vY7 ^ vY7;
vY8 = vY8 ^ vY8;
unsigned char *g0 = &gftbls[0 * 32 * vlen];
for (j = 0; j < vlen; j++) {
s = (unsigned char *)src[j];
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vlo0 = EC_vec_xl(0, g0);
vhi0 = EC_vec_xl(16, g0);
vX5 = vec_xl(64, s + i);
vX6 = vec_xl(80, s + i);
vX7 = vec_xl(96, s + i);
vX8 = vec_xl(112, s + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
vY5 = vY5 ^ EC_vec_permxor(vhi0, vlo0, vX5);
vY6 = vY6 ^ EC_vec_permxor(vhi0, vlo0, vX6);
vY7 = vY7 ^ EC_vec_permxor(vhi0, vlo0, vX7);
vY8 = vY8 ^ EC_vec_permxor(vhi0, vlo0, vX8);
g0 += 32;
}
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 32, t0 + i);
vec_xst(vY4, 48, t0 + i);
vec_xst(vY5, 64, t0 + i);
vec_xst(vY6, 80, t0 + i);
vec_xst(vY7, 96, t0 + i);
vec_xst(vY8, 112, t0 + i);
}
return;
}
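
Unlike the multi-output kernels, the single-output dot product processes 128 bytes per iteration (eight 16-byte VSX vectors) and lets the scalar base routine absorb the unaligned head. For example, with len = 1000, head = 1000 % 128 = 104 bytes go through gf_vect_dot_prod_base, and seven 128-byte vector strides cover the remaining 896 bytes.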


@@ -0,0 +1,48 @@
#include "ec_base_vsx.h"
void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
unsigned char *src, unsigned char *dest)
{
unsigned char *s, *t0;
vector unsigned char vX1, vY1;
vector unsigned char vX2, vY2;
vector unsigned char vX3, vY3;
vector unsigned char vX4, vY4;
vector unsigned char vhi0, vlo0;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest;
head = len % 64;
if (head != 0) {
gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, dest);
}
vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
for (i = head; i < len - 63; i += 64) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vY1 = vec_xl(0, t0 + i);
vY2 = vec_xl(16, t0 + i);
vY3 = vec_xl(32, t0 + i);
vY4 = vec_xl(48, t0 + i);
vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 32, t0 + i);
vec_xst(vY4, 48, t0 + i);
}
return;
}


@@ -0,0 +1,61 @@
#include "ec_base_vsx.h"
void gf_vect_mul_vsx(int len, unsigned char *gftbl, unsigned char *src, unsigned char *dest)
{
unsigned char *s, *t0;
vector unsigned char vX1, vY1;
vector unsigned char vX2, vY2;
vector unsigned char vX3, vY3;
vector unsigned char vX4, vY4;
vector unsigned char vX5, vY5;
vector unsigned char vX6, vY6;
vector unsigned char vX7, vY7;
vector unsigned char vX8, vY8;
vector unsigned char vhi0, vlo0;
int i, head;
s = (unsigned char *)src;
t0 = (unsigned char *)dest;
head = len % 128;
if (head != 0) {
gf_vect_mul_base(head, gftbl, src, dest);
}
vlo0 = EC_vec_xl(0, gftbl);
vhi0 = EC_vec_xl(16, gftbl);
for (i = head; i < len - 127; i += 128) {
vX1 = vec_xl(0, s + i);
vX2 = vec_xl(16, s + i);
vX3 = vec_xl(32, s + i);
vX4 = vec_xl(48, s + i);
vX5 = vec_xl(64, s + i);
vX6 = vec_xl(80, s + i);
vX7 = vec_xl(96, s + i);
vX8 = vec_xl(112, s + i);
vY1 = EC_vec_permxor(vhi0, vlo0, vX1);
vY2 = EC_vec_permxor(vhi0, vlo0, vX2);
vY3 = EC_vec_permxor(vhi0, vlo0, vX3);
vY4 = EC_vec_permxor(vhi0, vlo0, vX4);
vY5 = EC_vec_permxor(vhi0, vlo0, vX5);
vY6 = EC_vec_permxor(vhi0, vlo0, vX6);
vY7 = EC_vec_permxor(vhi0, vlo0, vX7);
vY8 = EC_vec_permxor(vhi0, vlo0, vX8);
vec_xst(vY1, 0, t0 + i);
vec_xst(vY2, 16, t0 + i);
vec_xst(vY3, 32, t0 + i);
vec_xst(vY4, 48, t0 + i);
vec_xst(vY5, 64, t0 + i);
vec_xst(vY6, 80, t0 + i);
vec_xst(vY7, 96, t0 + i);
vec_xst(vY8, 112, t0 + i);
}
return;
}
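
Pairing this kernel with table setup from the core API gives the usual single-coefficient multiply flow; a usage sketch (buffer names hypothetical):

#include "erasure_code.h"

/* dst[i] = c * src[i] over GF(2^8), via the VSX kernel above. */
static void scale_fragment(unsigned char c, unsigned char *src, unsigned char *dst, int len)
{
	unsigned char gftbl[32];

	gf_vect_mul_init(c, gftbl);	/* expand c into the 32-byte nibble tables */
	gf_vect_mul(len, gftbl, src, dst);
}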


@@ -38,6 +38,7 @@ lsrc += igzip/igzip.c \
lsrc_base_aliases += igzip/igzip_base_aliases.c igzip/proc_heap_base.c
lsrc_x86_32 += igzip/igzip_base_aliases.c igzip/proc_heap_base.c
lsrc_ppc64le += igzip/igzip_base_aliases.c igzip/proc_heap_base.c
lsrc_aarch64 += igzip/aarch64/igzip_inflate_multibinary_arm64.S \
igzip/aarch64/igzip_multibinary_arm64.S \


@@ -32,6 +32,7 @@ include mem/aarch64/Makefile.am
lsrc += mem/mem_zero_detect_base.c
lsrc_base_aliases += mem/mem_zero_detect_base_aliases.c
lsrc_ppc64le += mem/mem_zero_detect_base_aliases.c
lsrc_x86_64 += mem/mem_zero_detect_avx.asm \
mem/mem_zero_detect_sse.asm \


@@ -32,6 +32,7 @@ include raid/aarch64/Makefile.am
lsrc += raid/raid_base.c
lsrc_base_aliases += raid/raid_base_aliases.c
lsrc_ppc64le += raid/raid_base_aliases.c
lsrc_x86_64 += \
raid/xor_gen_sse.asm \