Optimize inv 16x16 DCT with 10 non-zero coeffs - P1
This commit is the first patch optimizing the SSE2 implementation of the inverse 16x16 DCT with <10 non-zero coefficients. It focuses on the first 1-D (row) transformation. It exploits the fact that only the top-left 4x4 block contains non-zero coefficients in a 2-D inverse 16x16 DCT with <10 coefficients. The average runtime of the idct16x16_10 unit is reduced from 883 cycles -> 779 cycles (12% faster). For pedestrian_area_1080p, 300 frames at 4000 kbps, the speed 2 runtime goes down from 310651 ms -> 305910 ms. The decoding speed goes up from 80.37 fps -> 80.87 fps. Change-Id: Ic6f3ac5a637a76c07ba73ddaafe318a699fea645
This commit is contained in:
parent
8fcb74e6bb
commit
ba6ab46cdc
@ -380,17 +380,13 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
|
||||
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
|
||||
}
|
||||
|
||||
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
|
||||
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
|
||||
{ \
|
||||
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
|
||||
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
|
||||
const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
|
||||
const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
|
||||
\
|
||||
in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
|
||||
in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
|
||||
in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \
|
||||
in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \
|
||||
}
|
||||
|
||||
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
|
||||
@ -2463,112 +2459,66 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
|
||||
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
|
||||
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
int i;
|
||||
in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
|
||||
// 1-D idct. Load input data.
|
||||
// in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
|
||||
// First 1-D inverse DCT
|
||||
// Load input data.
|
||||
in[0] = _mm_load_si128((const __m128i *)input);
|
||||
in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
|
||||
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
|
||||
in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
|
||||
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
|
||||
in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
|
||||
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
|
||||
in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
|
||||
|
||||
TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
|
||||
TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
|
||||
TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
|
||||
|
||||
// Stage2
|
||||
{
|
||||
const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
|
||||
const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
|
||||
const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
|
||||
const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
|
||||
const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
|
||||
const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
|
||||
|
||||
tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
|
||||
tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
|
||||
tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
|
||||
tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
|
||||
tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
|
||||
tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
|
||||
tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
|
||||
tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
|
||||
|
||||
tmp0 = _mm_add_epi32(tmp0, rounding);
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp4 = _mm_add_epi32(tmp4, rounding);
|
||||
tmp6 = _mm_add_epi32(tmp6, rounding);
|
||||
tmp1 = _mm_add_epi32(tmp1, rounding);
|
||||
tmp3 = _mm_add_epi32(tmp3, rounding);
|
||||
tmp5 = _mm_add_epi32(tmp5, rounding);
|
||||
tmp7 = _mm_add_epi32(tmp7, rounding);
|
||||
|
||||
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
|
||||
tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
|
||||
tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
|
||||
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
|
||||
tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
|
||||
tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
|
||||
|
||||
stp2_8 = _mm_packs_epi32(tmp0, zero);
|
||||
stp2_15 = _mm_packs_epi32(tmp2, zero);
|
||||
stp2_9 = _mm_packs_epi32(tmp4, zero);
|
||||
stp2_14 = _mm_packs_epi32(tmp6, zero);
|
||||
|
||||
stp2_10 = _mm_packs_epi32(tmp1, zero);
|
||||
stp2_13 = _mm_packs_epi32(tmp3, zero);
|
||||
stp2_11 = _mm_packs_epi32(tmp5, zero);
|
||||
stp2_12 = _mm_packs_epi32(tmp7, zero);
|
||||
stp2_8 = _mm_packs_epi32(tmp0, tmp2);
|
||||
stp2_11 = _mm_packs_epi32(tmp5, tmp7);
|
||||
}
|
||||
|
||||
// Stage3
|
||||
{
|
||||
const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
|
||||
const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
|
||||
const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
|
||||
|
||||
tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
|
||||
tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
|
||||
tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
|
||||
tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
|
||||
|
||||
tmp0 = _mm_add_epi32(tmp0, rounding);
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp4 = _mm_add_epi32(tmp4, rounding);
|
||||
tmp6 = _mm_add_epi32(tmp6, rounding);
|
||||
|
||||
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
|
||||
tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
|
||||
|
||||
stp1_4 = _mm_packs_epi32(tmp0, zero);
|
||||
stp1_7 = _mm_packs_epi32(tmp2, zero);
|
||||
stp1_5 = _mm_packs_epi32(tmp4, zero);
|
||||
stp1_6 = _mm_packs_epi32(tmp6, zero);
|
||||
stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
|
||||
stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
|
||||
|
||||
stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
|
||||
stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
|
||||
stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
|
||||
stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
|
||||
|
||||
stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
|
||||
stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
|
||||
stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
|
||||
stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
|
||||
stp1_4 = _mm_packs_epi32(tmp0, tmp2);
|
||||
}
|
||||
|
||||
// Stage4
|
||||
{
|
||||
const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
|
||||
const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
|
||||
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
|
||||
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
|
||||
const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
|
||||
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
|
||||
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
|
||||
|
||||
tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
|
||||
tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
|
||||
tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
|
||||
tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
|
||||
tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
|
||||
tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
|
||||
tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
|
||||
@ -2576,8 +2526,6 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
|
||||
|
||||
tmp0 = _mm_add_epi32(tmp0, rounding);
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp4 = _mm_add_epi32(tmp4, rounding);
|
||||
tmp6 = _mm_add_epi32(tmp6, rounding);
|
||||
tmp1 = _mm_add_epi32(tmp1, rounding);
|
||||
tmp3 = _mm_add_epi32(tmp3, rounding);
|
||||
tmp5 = _mm_add_epi32(tmp5, rounding);
|
||||
@ -2585,49 +2533,40 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
|
||||
|
||||
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
|
||||
tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
|
||||
tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
|
||||
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
|
||||
tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
|
||||
tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
|
||||
|
||||
stp2_0 = _mm_packs_epi32(tmp0, zero);
|
||||
stp2_1 = _mm_packs_epi32(tmp2, zero);
|
||||
stp2_2 = _mm_packs_epi32(tmp4, zero);
|
||||
stp2_3 = _mm_packs_epi32(tmp6, zero);
|
||||
stp2_9 = _mm_packs_epi32(tmp1, zero);
|
||||
stp2_14 = _mm_packs_epi32(tmp3, zero);
|
||||
stp2_10 = _mm_packs_epi32(tmp5, zero);
|
||||
stp2_13 = _mm_packs_epi32(tmp7, zero);
|
||||
stp1_0 = _mm_packs_epi32(tmp0, tmp0);
|
||||
stp1_1 = _mm_packs_epi32(tmp2, tmp2);
|
||||
stp2_9 = _mm_packs_epi32(tmp1, tmp3);
|
||||
stp2_10 = _mm_packs_epi32(tmp5, tmp7);
|
||||
|
||||
stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
|
||||
stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
|
||||
stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
|
||||
stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
|
||||
stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
|
||||
}
|
||||
|
||||
// Stage5 and Stage6
|
||||
{
|
||||
stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
|
||||
stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
|
||||
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
|
||||
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
|
||||
tmp0 = _mm_add_epi16(stp2_8, stp2_11);
|
||||
tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
|
||||
tmp2 = _mm_add_epi16(stp2_9, stp2_10);
|
||||
tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
|
||||
|
||||
stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
|
||||
stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
|
||||
stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
|
||||
stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
|
||||
stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
|
||||
stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
|
||||
stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
|
||||
stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
|
||||
|
||||
stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
|
||||
stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
|
||||
stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
|
||||
stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
|
||||
stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
|
||||
stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
|
||||
stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
|
||||
stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
|
||||
}
|
||||
|
||||
// Stage6
|
||||
{
|
||||
const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
|
||||
const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
|
||||
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
|
||||
const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
|
||||
|
||||
@ -2652,21 +2591,26 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
|
||||
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
|
||||
tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
|
||||
|
||||
stp1_5 = _mm_packs_epi32(tmp1, zero);
|
||||
stp1_6 = _mm_packs_epi32(tmp3, zero);
|
||||
stp1_6 = _mm_packs_epi32(tmp3, tmp1);
|
||||
|
||||
stp2_10 = _mm_packs_epi32(tmp0, zero);
|
||||
stp2_13 = _mm_packs_epi32(tmp2, zero);
|
||||
stp2_11 = _mm_packs_epi32(tmp4, zero);
|
||||
stp2_12 = _mm_packs_epi32(tmp6, zero);
|
||||
|
||||
stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
|
||||
stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
|
||||
stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
|
||||
stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
|
||||
stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
|
||||
stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
|
||||
stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
|
||||
stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
|
||||
tmp0 = _mm_add_epi16(stp1_0, stp1_4);
|
||||
tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
|
||||
tmp2 = _mm_add_epi16(stp1_1, stp1_6);
|
||||
tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
|
||||
|
||||
stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
|
||||
stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
|
||||
stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
|
||||
stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
|
||||
stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
|
||||
stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
|
||||
stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
|
||||
stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
|
||||
}
|
||||
|
||||
// Stage7. Left 8x16 only.
|
||||
@ -2687,7 +2631,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
|
||||
l[14] = _mm_sub_epi16(stp2_1, stp1_14);
|
||||
l[15] = _mm_sub_epi16(stp2_0, stp1_15);
|
||||
|
||||
// 2-D idct. We do 2 8x16 blocks.
|
||||
// Second 1-D inverse transform, performed per 8x16 block
|
||||
for (i = 0; i < 2; i++) {
|
||||
array_transpose_4X8(l + 8*i, in);
|
||||
in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
|
||||
|
Loading…
Reference in New Issue
Block a user