ppc: Add vpx_idct16x16_256_add_vsx

Change-Id: Ibc3f7965423fd91179f8d8e77c7ae3e6d7f80572
Alexandra Hájková 2017-06-13 17:33:32 +00:00
parent 8836e46ffd
commit 666c543f7b
3 changed files with 323 additions and 18 deletions

View File

@@ -858,4 +858,11 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT,
::testing::Values(make_tuple(&vpx_fdct16x16_c,
&vpx_idct16x16_256_add_vsx,
0, VPX_BITS_8)));
#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
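
The tuple above pairs the C forward transform with the new VSX inverse transform so the test can round-trip a block through both. A minimal caller sketch, assuming the standard libvpx prototypes (the round_trip_16x16 helper is hypothetical, not part of the test suite):

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"

/* Hypothetical round trip: forward 16x16 DCT in C, inverse with the new
 * VSX kernel; the residual is added into the 8-bit reconstruction. */
static void round_trip_16x16(const int16_t *diff, uint8_t *recon, int stride) {
  tran_low_t coeff[16 * 16];
  vpx_fdct16x16_c(diff, coeff, 16);                /* source stride is 16 */
  vpx_idct16x16_256_add_vsx(coeff, recon, stride); /* dest stride in pixels */
}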

View File

@@ -17,17 +17,32 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
16305, 16305, 16305, 16305 };
static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
16069, 16069, 16069, 16069 };
static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
15679, 15679, 15679, 15679 };
static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
15137, 15137, 15137, 15137 };
static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
14449, 14449, 14449, 14449 };
static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
13623, 13623, 13623, 13623 };
static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
12665, 12665, 12665, 12665 };
static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
11585, 11585, 11585, 11585 };
static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
10394, 10394, 10394, 10394 };
static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 };
static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 };
static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 };
static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270,
-6270, -6270, -6270, -6270 };
static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 };
static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 };
static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 };
#define ROUND_SHIFT_INIT \
const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \
@@ -39,7 +54,7 @@ static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 };
int16x8_t add8 = vec_splat_s16(8); \
uint16x8_t shift4 = vec_splat_u16(4);
#define PIXEL_ADD(out, in) out = vec_sra(vec_add(in, add8), shift4);
#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);
#define IDCT4(in0, in1, out0, out1) \
t0 = vec_add(in0, in1); \
@@ -98,8 +113,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
v1 = vec_mergel(t0, t1);
IDCT4(v0, v1, t_out0, t_out1);
PIXEL_ADD(v0, t_out0);
PIXEL_ADD(v1, t_out1);
PIXEL_ADD4(v0, t_out0);
PIXEL_ADD4(v1, t_out1);
tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);
tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);
output_v = vec_packsu(tmp16_0, tmp16_1);
@@ -203,17 +218,21 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
in6 = vec_sub(step1, step6); \
in7 = vec_sub(step0, step7);
#define PIXEL_ADD8(in, out) \
out = vec_add(vec_sra(vec_add(in, add), shift5), out);
#define PIXEL_ADD(in, out, add, shiftx) \
out = vec_add(vec_sra(vec_add(in, add), shiftx), out);
static uint8x16_t tr8_mask0 = {
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
};
static uint8x16_t tr8_mask1 = {
0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
};
void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
int stride) {
int32x4_t temp10, temp11;
int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
uint8x16_t tr8_mask0 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
uint8x16_t tr8_mask1 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };
int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1,
tmp16_2, tmp16_3;
int16x8_t src0 = vec_vsx_ld(0, input);
@@ -253,14 +272,14 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2,
src3, src4, src5, src6, src7);
IDCT8(src0, src1, src2, src3, src4, src5, src6, src7);
PIXEL_ADD8(src0, d_u0);
PIXEL_ADD8(src1, d_u1);
PIXEL_ADD8(src2, d_u2);
PIXEL_ADD8(src3, d_u3);
PIXEL_ADD8(src4, d_u4);
PIXEL_ADD8(src5, d_u5);
PIXEL_ADD8(src6, d_u6);
PIXEL_ADD8(src7, d_u7);
PIXEL_ADD(src0, d_u0, add, shift5);
PIXEL_ADD(src1, d_u1, add, shift5);
PIXEL_ADD(src2, d_u2, add, shift5);
PIXEL_ADD(src3, d_u3, add, shift5);
PIXEL_ADD(src4, d_u4, add, shift5);
PIXEL_ADD(src5, d_u5, add, shift5);
PIXEL_ADD(src6, d_u6, add, shift5);
PIXEL_ADD(src7, d_u7, add, shift5);
output0 = vec_packsu(d_u0, d_u1);
output1 = vec_packsu(d_u2, d_u3);
output2 = vec_packsu(d_u4, d_u5);
@@ -275,3 +294,282 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
}
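
Per lane, the parameterized PIXEL_ADD macro above is the usual rounded shift-and-accumulate; a scalar sketch, with the rounding parameters stated as assumptions (16 and 5 for the 8x8 path, 32 and 6 for the 16x16 path below):

#include <stdint.h>

/* One PIXEL_ADD lane: round the residual by `add`, arithmetic-shift it
 * down by `shift`, then accumulate into the widened destination sample. */
static int16_t pixel_add_lane(int16_t residual, int16_t dest, int add, int shift) {
  return (int16_t)(((residual + add) >> shift) + dest);
}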
#define LOAD_INPUT16(source, offset, step, in0, in1, in2, in3, in4, in5, in6, \
in7, in8, in9, inA, inB, inC, inD, inE, inF) \
in0 = vec_vsx_ld(offset, source); \
in1 = vec_vsx_ld(step + offset, source); \
in2 = vec_vsx_ld(2 * step + offset, source); \
in3 = vec_vsx_ld(3 * step + offset, source); \
in4 = vec_vsx_ld(4 * step + offset, source); \
in5 = vec_vsx_ld(5 * step + offset, source); \
in6 = vec_vsx_ld(6 * step + offset, source); \
in7 = vec_vsx_ld(7 * step + offset, source); \
in8 = vec_vsx_ld(8 * step + offset, source); \
in9 = vec_vsx_ld(9 * step + offset, source); \
inA = vec_vsx_ld(10 * step + offset, source); \
inB = vec_vsx_ld(11 * step + offset, source); \
inC = vec_vsx_ld(12 * step + offset, source); \
inD = vec_vsx_ld(13 * step + offset, source); \
inE = vec_vsx_ld(14 * step + offset, source); \
inF = vec_vsx_ld(15 * step + offset, source);
#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
tmp16_0 = vec_mergeh(inpt0, inpt1); \
tmp16_1 = vec_mergel(inpt0, inpt1); \
temp10 = vec_mule(tmp16_0, cospi); \
temp11 = vec_mule(tmp16_1, cospi); \
temp20 = vec_mulo(tmp16_0, cospi); \
temp21 = vec_mulo(tmp16_1, cospi); \
temp30 = vec_sub(temp10, temp20); \
temp10 = vec_add(temp10, temp20); \
temp20 = vec_sub(temp11, temp21); \
temp21 = vec_add(temp11, temp21); \
DCT_CONST_ROUND_SHIFT(temp30); \
DCT_CONST_ROUND_SHIFT(temp20); \
outpt0 = vec_packs(temp30, temp20); \
DCT_CONST_ROUND_SHIFT(temp10); \
DCT_CONST_ROUND_SHIFT(temp21); \
outpt1 = vec_packs(temp10, temp21);
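
STEP16_1 interleaves two input vectors so vec_mule/vec_mulo yield the products of each input with a single shared cosine constant, then forms the difference and sum and applies the DCT rounding shift. A scalar sketch of one lane, assuming DCT_CONST_ROUND_SHIFT is (x + 8192) >> 14:

#include <stdint.h>

/* One STEP16_1 lane: out0 = round(in0*c - in1*c), out1 = round(in0*c + in1*c),
 * both right-shifted by the 14-bit DCT constant precision. */
static void step16_1_lane(int16_t in0, int16_t in1, int16_t c, int16_t *out0,
                          int16_t *out1) {
  const int32_t d = (int32_t)in0 * c - (int32_t)in1 * c;
  const int32_t s = (int32_t)in0 * c + (int32_t)in1 * c;
  *out0 = (int16_t)((d + (1 << 13)) >> 14);
  *out1 = (int16_t)((s + (1 << 13)) >> 14);
}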
#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \
inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \
out7, out8, out9, outA, outB, outC, outD, outE, outF) \
/* stage 1 */ \
/* out0 = in0; */ \
out1 = in8; \
out2 = in4; \
out3 = inC; \
out4 = in2; \
out5 = inA; \
out6 = in6; \
out7 = inE; \
out8 = in1; \
out9 = in9; \
outA = in5; \
outB = inD; \
outC = in3; \
outD = inB; \
outE = in7; \
outF = inF; \
\
/* stage 2 */ \
/* in0 = out0; */ \
in1 = out1; \
in2 = out2; \
in3 = out3; \
in4 = out4; \
in5 = out5; \
in6 = out6; \
in7 = out7; \
\
STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \
STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \
STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \
STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \
\
/* stage 3 */ \
out0 = in0; \
out1 = in1; \
out2 = in2; \
out3 = in3; \
\
STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \
STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \
\
out8 = vec_add(in8, in9); \
out9 = vec_sub(in8, in9); \
outA = vec_sub(inB, inA); \
outB = vec_add(inA, inB); \
outC = vec_add(inC, inD); \
outD = vec_sub(inC, inD); \
outE = vec_sub(inF, inE); \
outF = vec_add(inE, inF); \
\
/* stage 4 */ \
STEP16_1(out0, out1, in1, in0, cospi16_v); \
STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \
in4 = vec_add(out4, out5); \
in5 = vec_sub(out4, out5); \
in6 = vec_sub(out7, out6); \
in7 = vec_add(out6, out7); \
\
in8 = out8; \
inF = outF; \
tmp16_0 = vec_mergeh(out9, outE); \
tmp16_1 = vec_mergel(out9, outE); \
temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
DCT_CONST_ROUND_SHIFT(temp10); \
DCT_CONST_ROUND_SHIFT(temp11); \
in9 = vec_packs(temp10, temp11); \
temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
DCT_CONST_ROUND_SHIFT(temp10); \
DCT_CONST_ROUND_SHIFT(temp11); \
inE = vec_packs(temp10, temp11); \
\
tmp16_0 = vec_mergeh(outA, outD); \
tmp16_1 = vec_mergel(outA, outD); \
temp10 = \
vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \
temp11 = \
vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \
DCT_CONST_ROUND_SHIFT(temp10); \
DCT_CONST_ROUND_SHIFT(temp11); \
inA = vec_packs(temp10, temp11); \
temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
DCT_CONST_ROUND_SHIFT(temp10); \
DCT_CONST_ROUND_SHIFT(temp11); \
inD = vec_packs(temp10, temp11); \
\
inB = outB; \
inC = outC; \
\
/* stage 5 */ \
out0 = vec_add(in0, in3); \
out1 = vec_add(in1, in2); \
out2 = vec_sub(in1, in2); \
out3 = vec_sub(in0, in3); \
out4 = in4; \
STEP16_1(in6, in5, out5, out6, cospi16_v); \
out7 = in7; \
\
out8 = vec_add(in8, inB); \
out9 = vec_add(in9, inA); \
outA = vec_sub(in9, inA); \
outB = vec_sub(in8, inB); \
outC = vec_sub(inF, inC); \
outD = vec_sub(inE, inD); \
outE = vec_add(inD, inE); \
outF = vec_add(inC, inF); \
\
/* stage 6 */ \
in0 = vec_add(out0, out7); \
in1 = vec_add(out1, out6); \
in2 = vec_add(out2, out5); \
in3 = vec_add(out3, out4); \
in4 = vec_sub(out3, out4); \
in5 = vec_sub(out2, out5); \
in6 = vec_sub(out1, out6); \
in7 = vec_sub(out0, out7); \
in8 = out8; \
in9 = out9; \
STEP16_1(outD, outA, inA, inD, cospi16_v); \
STEP16_1(outC, outB, inB, inC, cospi16_v); \
inE = outE; \
inF = outF; \
\
/* stage 7 */ \
out0 = vec_add(in0, inF); \
out1 = vec_add(in1, inE); \
out2 = vec_add(in2, inD); \
out3 = vec_add(in3, inC); \
out4 = vec_add(in4, inB); \
out5 = vec_add(in5, inA); \
out6 = vec_add(in6, in9); \
out7 = vec_add(in7, in8); \
out8 = vec_sub(in7, in8); \
out9 = vec_sub(in6, in9); \
outA = vec_sub(in5, inA); \
outB = vec_sub(in4, inB); \
outC = vec_sub(in3, inC); \
outD = vec_sub(in2, inD); \
outE = vec_sub(in1, inE); \
outF = vec_sub(in0, inF);
#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
d_uh = (int16x8_t)vec_mergeh(dst, zerov); \
d_ul = (int16x8_t)vec_mergel(dst, zerov); \
PIXEL_ADD(in0, d_uh, add, shift6); \
PIXEL_ADD(in1, d_ul, add, shift6); \
vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);
void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
int stride) {
int32x4_t temp10, temp11, temp20, temp21, temp30;
int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10,
src11, src12, src13, src14, src15, src16, src17;
int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30,
src31, src32, src33, src34, src35, src36, src37;
int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10,
tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1;
int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30,
tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37;
uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8,
dest9, destA, destB, destC, destD, destE, destF;
int16x8_t d_uh, d_ul;
int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
uint16x8_t shift6 = vec_splat_u16(6);
uint8x16_t zerov = vec_splat_u8(0);
ROUND_SHIFT_INIT;
// transform rows
// load and transform the upper half of 16x16 matrix
LOAD_INPUT16(input, 0, 16, src00, src10, src01, src11, src02, src12, src03,
src13, src04, src14, src05, src15, src06, src16, src07, src17);
TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11,
tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03,
src04, src05, src06, src07, src10, src11, src12, src13, src14, src15,
src16, src17);
TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
// load and transform the lower half of 16x16 matrix
LOAD_INPUT16(input, 16 * 16, 16, src20, src30, src21, src31, src22, src32,
src23, src33, src24, src34, src25, src35, src26, src36, src27,
src37);
TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31,
tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23,
src24, src25, src26, src27, src30, src31, src32, src33, src34, src35,
src36, src37);
TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
// transform columns
// left half first
IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21,
tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03,
src04, src05, src06, src07, src20, src21, src22, src23, src24, src25,
src26, src27);
// right half
IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31,
tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13,
src14, src15, src16, src17, src30, src31, src32, src33, src34, src35,
src36, src37);
// load dest
LOAD_INPUT16(dest, 0, stride, dest0, dest1, dest2, dest3, dest4, dest5, dest6,
dest7, dest8, dest9, destA, destB, destC, destD, destE, destF);
PIXEL_ADD_STORE16(src00, src10, dest0, 0);
PIXEL_ADD_STORE16(src01, src11, dest1, stride);
PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride);
PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride);
PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride);
PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride);
PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride);
PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride);
PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride);
PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride);
PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride);
PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride);
PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride);
PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride);
PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride);
PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride);
}
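
The cospiN_v splat vectors at the top of this file hold the same 14-bit fixed-point cosine constants the scalar inverse transform uses, replicated across all eight lanes; cospi24_mv is simply the negated cospi24 value. A small generator sketch (an assumption about the derivation, matching the values above):

#include <math.h>
#include <stdio.h>

/* Each cospiN lane should equal round(16384 * cos(N * pi / 64)),
 * e.g. 16305 for N = 2 and 11585 for N = 16. */
int main(void) {
  const double kPi = 3.14159265358979323846;
  int n;
  for (n = 2; n <= 30; n += 2)
    printf("cospi%d: %d\n", n, (int)floor(16384.0 * cos(n * kPi / 64.0) + 0.5));
  return 0;
}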

View File

@@ -589,7 +589,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vpx_idct8x8_64_add neon sse2 vsx/;
specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
specialize qw/vpx_idct8x8_1_add neon sse2/;
specialize qw/vpx_idct16x16_256_add neon sse2/;
specialize qw/vpx_idct16x16_256_add neon sse2 vsx/;
specialize qw/vpx_idct16x16_38_add neon sse2/;
$vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
specialize qw/vpx_idct16x16_10_add neon sse2/;
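
With vsx added to the specialize list, the RTCD layer can route the generic 16x16 call to the new kernel at run time; a hedged caller-side sketch, assuming the generated vpx_idct16x16_256_add dispatch symbol:

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"

/* Callers keep using the generic name; on POWER builds with VSX support
 * the RTCD table resolves it to vpx_idct16x16_256_add_vsx. */
static void reconstruct_16x16_block(const tran_low_t *dqcoeff, uint8_t *dst,
                                    int stride) {
  vpx_idct16x16_256_add(dqcoeff, dst, stride);
}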