Merge "4x4 hybrid transform type V_DCT to H_FLIPADST SSE2 optimization" into nextgenv2
This commit is contained in:
@@ -119,6 +119,58 @@ TEST_P(VP10HighbdTrans4x4HT, HighbdCoeffCheck) {
|
|||||||
}
|
}
|
||||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
|
||||||
|
#if CONFIG_EXT_TX
|
||||||
|
TEST(VP10Trans4x4HTSpeedTest, C_version) {
  // Speed check for the C forward 4x4 hybrid transform: exercise every
  // extended transform type in [V_DCT, H_FLIPADST] over many random
  // residual blocks. No output validation -- this is a throughput test.
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 20000;
  const int bit_depth = 8;
  const int mask = (1 << bit_depth) - 1;
  const int num_coeffs = 16;
  const int stride = 4;
  // Sizes are fixed at compile time: use automatic storage instead of raw
  // new[]/delete[], so no cleanup is needed on any exit path and nothing
  // can leak if the test body returns early.
  int16_t input[num_coeffs];
  tran_low_t output[num_coeffs];

  for (int i = 0; i < count_test_block; ++i) {
    // Random residual samples in [-mask, mask].
    for (int j = 0; j < num_coeffs; ++j) {
      input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
    }
    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
      vp10_fht4x4_c(input, output, stride, tx_type);
    }
  }
}
|
||||||
|
#endif // CONFIG_EXT_TX
|
||||||
|
|
||||||
|
#if HAVE_SSE2 && CONFIG_EXT_TX
|
||||||
|
TEST(VP10Trans4x4HTSpeedTest, SSE2_version) {
  // Speed check for the SSE2 forward 4x4 hybrid transform: run the
  // extended transform types V_DCT .. H_FLIPADST over many random blocks.
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  const int count_test_block = 20000;
  const int bit_depth = 8;
  const int mask = (1 << bit_depth) - 1;
  const int num_coeffs = 16;
  const int stride = 4;
  // Buffers stay heap-allocated, matching the C-version test's setup.
  int16_t *input = new int16_t[num_coeffs];
  tran_low_t *output = new tran_low_t[num_coeffs];

  for (int blk = 0; blk < count_test_block; ++blk) {
    // Random residual samples in [-mask, mask].
    for (int k = 0; k < num_coeffs; ++k)
      input[k] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
    for (int tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type)
      vp10_fht4x4_sse2(input, output, stride, tx_type);
  }

  delete[] input;
  delete[] output;
}
|
||||||
|
#endif // HAVE_SSE2 && CONFIG_EXT_TX
|
||||||
|
|
||||||
using std::tr1::make_tuple;
|
using std::tr1::make_tuple;
|
||||||
|
|
||||||
#if HAVE_SSE2
|
#if HAVE_SSE2
|
||||||
@@ -152,6 +204,18 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 7,
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 7,
|
||||||
VPX_BITS_8, 16),
|
VPX_BITS_8, 16),
|
||||||
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 8,
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 8,
|
||||||
|
VPX_BITS_8, 16),
|
||||||
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 10,
|
||||||
|
VPX_BITS_8, 16),
|
||||||
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 11,
|
||||||
|
VPX_BITS_8, 16),
|
||||||
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 12,
|
||||||
|
VPX_BITS_8, 16),
|
||||||
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 13,
|
||||||
|
VPX_BITS_8, 16),
|
||||||
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 14,
|
||||||
|
VPX_BITS_8, 16),
|
||||||
|
make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 15,
|
||||||
VPX_BITS_8, 16)));
|
VPX_BITS_8, 16)));
|
||||||
#endif // !CONFIG_EXT_TX
|
#endif // !CONFIG_EXT_TX
|
||||||
#endif // HAVE_SSE2
|
#endif // HAVE_SSE2
|
||||||
|
@@ -62,7 +62,7 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
|
|||||||
case H_ADST:
|
case H_ADST:
|
||||||
case V_FLIPADST:
|
case V_FLIPADST:
|
||||||
case H_FLIPADST:
|
case H_FLIPADST:
|
||||||
vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
|
vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
|
||||||
break;
|
break;
|
||||||
case IDTX:
|
case IDTX:
|
||||||
vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
|
vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
|
||||||
|
@@ -172,6 +172,41 @@ static void fadst4_sse2(__m128i *in) {
|
|||||||
transpose_4x4(in);
|
transpose_4x4(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if CONFIG_EXT_TX
|
||||||
|
static void fidtx4_sse2(__m128i *in) {
  // Forward "identity" stage for one dimension of a 4x4 block: scales each
  // sample by the fixed-point sqrt(2) constant (with DCT-style rounding)
  // and transposes, ending with the same transpose_4x4 epilogue as
  // fdct4_sse2/fadst4_sse2 so it can be paired with them for the
  // V_*/H_* extended tx types.
  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i v0, v1, v2, v3;
  __m128i u0, u1, u2, u3;

  // Interleave each row with zeros so every 32-bit lane holds the pair
  // (sample, 0); _mm_madd_epi16 below then produces sample * Sqrt2 as a
  // full 32-bit product (the zero partner contributes nothing).
  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);

  u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
  u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
  u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
  u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);

  // Round and shift back to the 16-bit fixed-point domain:
  // (sample * Sqrt2 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

  // Saturating-pack rows (0,2) and (1,3) back to 16 bits -- presumably the
  // two-register 4x4 layout transpose_4x4 consumes (fadst4_sse2 ends the
  // same way); in[2]/in[3] are left for transpose_4x4 to rewrite.
  in[0] = _mm_packs_epi32(u0, u2);
  in[1] = _mm_packs_epi32(u1, u3);
  transpose_4x4(in);
}
|
||||||
|
#endif // CONFIG_EXT_TX
|
||||||
|
|
||||||
void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
|
void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
|
||||||
int stride, int tx_type) {
|
int stride, int tx_type) {
|
||||||
__m128i in[4];
|
__m128i in[4];
|
||||||
@@ -229,10 +264,45 @@ void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
|
|||||||
fadst4_sse2(in);
|
fadst4_sse2(in);
|
||||||
write_buffer_4x4(output, in);
|
write_buffer_4x4(output, in);
|
||||||
break;
|
break;
|
||||||
|
case V_DCT:
|
||||||
|
load_buffer_4x4(input, in, stride, 0, 0);
|
||||||
|
fdct4_sse2(in);
|
||||||
|
fidtx4_sse2(in);
|
||||||
|
write_buffer_4x4(output, in);
|
||||||
|
break;
|
||||||
|
case H_DCT:
|
||||||
|
load_buffer_4x4(input, in, stride, 0, 0);
|
||||||
|
fidtx4_sse2(in);
|
||||||
|
fdct4_sse2(in);
|
||||||
|
write_buffer_4x4(output, in);
|
||||||
|
break;
|
||||||
|
case V_ADST:
|
||||||
|
load_buffer_4x4(input, in, stride, 0, 0);
|
||||||
|
fadst4_sse2(in);
|
||||||
|
fidtx4_sse2(in);
|
||||||
|
write_buffer_4x4(output, in);
|
||||||
|
break;
|
||||||
|
case H_ADST:
|
||||||
|
load_buffer_4x4(input, in, stride, 0, 0);
|
||||||
|
fidtx4_sse2(in);
|
||||||
|
fadst4_sse2(in);
|
||||||
|
write_buffer_4x4(output, in);
|
||||||
|
break;
|
||||||
|
case V_FLIPADST:
|
||||||
|
load_buffer_4x4(input, in, stride, 1, 0);
|
||||||
|
fadst4_sse2(in);
|
||||||
|
fidtx4_sse2(in);
|
||||||
|
write_buffer_4x4(output, in);
|
||||||
|
break;
|
||||||
|
case H_FLIPADST:
|
||||||
|
load_buffer_4x4(input, in, stride, 0, 1);
|
||||||
|
fidtx4_sse2(in);
|
||||||
|
fadst4_sse2(in);
|
||||||
|
write_buffer_4x4(output, in);
|
||||||
|
break;
|
||||||
#endif // CONFIG_EXT_TX
|
#endif // CONFIG_EXT_TX
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user