Optimize 32x32 2D inverse DCT for speed-up

This commit exploits the sparsity of the quantized coefficient matrix.
It checks whether each 32x8 array of coefficients is entirely zero and
skips the corresponding inverse transform if so (a sketch of this check
appears after the commit metadata below).

For ped1080p at 8000 kbps, this on average reduces the runtime of the
32x32 inverse 2D-DCT SSE2 function from 6256 cycles to 5200 cycles.
It makes the overall encoding process about 2% faster at speed 0.
The speed-up is more pronounced in the decoding process.

Change-Id: If20056c3566bd117642a76f8884c83e8bc8efbcf
Jingning Han 2013-07-31 16:50:34 -07:00
parent 86c384d398
commit 9d67495f72
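
Below is a minimal, self-contained sketch of the all-zero check described in the
commit message, written against plain SSE2 intrinsics. The helper name
block_32x8_is_zero and its standalone form are illustrative only; the actual
patch performs the same OR-reduction inline on the already-loaded in0..in31
registers and tests the two 32-bit zero_flag words.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Illustrative helper (not part of the patch): returns 1 if all 256
 * int16_t coefficients of a 32x8 block are zero, so the 1-D transform
 * for that block can be skipped.  coeffs must be 16-byte aligned. */
static int block_32x8_is_zero(const int16_t *coeffs) {
  __m128i acc = _mm_setzero_si128();
  int k;
  /* OR together all 32 vectors of 8 coefficients; the accumulator is
   * zero iff every coefficient is zero. */
  for (k = 0; k < 32; ++k)
    acc = _mm_or_si128(acc, _mm_load_si128((const __m128i *)(coeffs + 8 * k)));
  /* Fold the 128-bit accumulator into two 32-bit words and test them,
   * mirroring the zero_flag[0]/zero_flag[1] test in the patch. */
  {
    const __m128i hi = _mm_unpackhi_epi64(acc, acc);            /* upper 64 bits */
    const __m128i folded = _mm_or_si128(acc, hi);               /* low 64 | high 64 */
    const int w0 = _mm_cvtsi128_si32(folded);                   /* bits  0..31 */
    const int w1 = _mm_cvtsi128_si32(_mm_srli_epi64(folded, 32)); /* bits 32..63 */
    return (w0 | w1) == 0;
  }
}

When the check fires, the patch writes 32 zero vectors into the corresponding
col[] slots and continues, skipping the transpose and the first 1-D transform
for that 32x8 block entirely.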

@@ -2813,6 +2813,12 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
   }
 }
 
+#define LOAD_DQCOEFF(reg, input) \
+  {  \
+    reg = _mm_load_si128((__m128i *) input); \
+    input += 8; \
+  } \
+
 void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -2880,48 +2886,126 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j;
+  int i, j, i32;
+  __m128i zero_idx[16];
+  int zero_flag[2];
 
   // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
   for (i = 0; i < 8; i++) {
+    i32 = (i << 5);
     if (i < 4) {
       // First 1-D idct
       // Load input data.
-      in0 = _mm_load_si128((__m128i *)input);
-      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
-      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
-      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
-      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
-      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
-      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
-      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
-      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
-      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
-      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
-      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
-      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
-      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
-      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
-      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
-      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
-      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
-      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
-      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
-      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
-      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
-      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
-      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
-      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
-      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
-      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
-      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
-      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
-      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
-      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
-      input += 256;
+      LOAD_DQCOEFF(in0, input);
+      LOAD_DQCOEFF(in8, input);
+      LOAD_DQCOEFF(in16, input);
+      LOAD_DQCOEFF(in24, input);
+      LOAD_DQCOEFF(in1, input);
+      LOAD_DQCOEFF(in9, input);
+      LOAD_DQCOEFF(in17, input);
+      LOAD_DQCOEFF(in25, input);
+      LOAD_DQCOEFF(in2, input);
+      LOAD_DQCOEFF(in10, input);
+      LOAD_DQCOEFF(in18, input);
+      LOAD_DQCOEFF(in26, input);
+      LOAD_DQCOEFF(in3, input);
+      LOAD_DQCOEFF(in11, input);
+      LOAD_DQCOEFF(in19, input);
+      LOAD_DQCOEFF(in27, input);
+      LOAD_DQCOEFF(in4, input);
+      LOAD_DQCOEFF(in12, input);
+      LOAD_DQCOEFF(in20, input);
+      LOAD_DQCOEFF(in28, input);
+      LOAD_DQCOEFF(in5, input);
+      LOAD_DQCOEFF(in13, input);
+      LOAD_DQCOEFF(in21, input);
+      LOAD_DQCOEFF(in29, input);
+      LOAD_DQCOEFF(in6, input);
+      LOAD_DQCOEFF(in14, input);
+      LOAD_DQCOEFF(in22, input);
+      LOAD_DQCOEFF(in30, input);
+      LOAD_DQCOEFF(in7, input);
+      LOAD_DQCOEFF(in15, input);
+      LOAD_DQCOEFF(in23, input);
+      LOAD_DQCOEFF(in31, input);
+      // checking if all entries are zero
+      zero_idx[0] = _mm_or_si128(in0, in1);
+      zero_idx[1] = _mm_or_si128(in2, in3);
+      zero_idx[2] = _mm_or_si128(in4, in5);
+      zero_idx[3] = _mm_or_si128(in6, in7);
+      zero_idx[4] = _mm_or_si128(in8, in9);
+      zero_idx[5] = _mm_or_si128(in10, in11);
+      zero_idx[6] = _mm_or_si128(in12, in13);
+      zero_idx[7] = _mm_or_si128(in14, in15);
+      zero_idx[8] = _mm_or_si128(in16, in17);
+      zero_idx[9] = _mm_or_si128(in18, in19);
+      zero_idx[10] = _mm_or_si128(in20, in21);
+      zero_idx[11] = _mm_or_si128(in22, in23);
+      zero_idx[12] = _mm_or_si128(in24, in25);
+      zero_idx[13] = _mm_or_si128(in26, in27);
+      zero_idx[14] = _mm_or_si128(in28, in29);
+      zero_idx[15] = _mm_or_si128(in30, in31);
+      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
+      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
+      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
+      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
+      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
+      if (!zero_flag[0] && !zero_flag[1]) {
+        col[i32 + 0] = _mm_setzero_si128();
+        col[i32 + 1] = _mm_setzero_si128();
+        col[i32 + 2] = _mm_setzero_si128();
+        col[i32 + 3] = _mm_setzero_si128();
+        col[i32 + 4] = _mm_setzero_si128();
+        col[i32 + 5] = _mm_setzero_si128();
+        col[i32 + 6] = _mm_setzero_si128();
+        col[i32 + 7] = _mm_setzero_si128();
+        col[i32 + 8] = _mm_setzero_si128();
+        col[i32 + 9] = _mm_setzero_si128();
+        col[i32 + 10] = _mm_setzero_si128();
+        col[i32 + 11] = _mm_setzero_si128();
+        col[i32 + 12] = _mm_setzero_si128();
+        col[i32 + 13] = _mm_setzero_si128();
+        col[i32 + 14] = _mm_setzero_si128();
+        col[i32 + 15] = _mm_setzero_si128();
+        col[i32 + 16] = _mm_setzero_si128();
+        col[i32 + 17] = _mm_setzero_si128();
+        col[i32 + 18] = _mm_setzero_si128();
+        col[i32 + 19] = _mm_setzero_si128();
+        col[i32 + 20] = _mm_setzero_si128();
+        col[i32 + 21] = _mm_setzero_si128();
+        col[i32 + 22] = _mm_setzero_si128();
+        col[i32 + 23] = _mm_setzero_si128();
+        col[i32 + 24] = _mm_setzero_si128();
+        col[i32 + 25] = _mm_setzero_si128();
+        col[i32 + 26] = _mm_setzero_si128();
+        col[i32 + 27] = _mm_setzero_si128();
+        col[i32 + 28] = _mm_setzero_si128();
+        col[i32 + 29] = _mm_setzero_si128();
+        col[i32 + 30] = _mm_setzero_si128();
+        col[i32 + 31] = _mm_setzero_si128();
+        continue;
+      }
       // Transpose 32x8 block to 8x32 block
       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
@@ -3292,38 +3376,38 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
     // final stage
     if (i < 4) {
       // 1_D: Store 32 intermediate results for each 8x32 block.
-      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
       const __m128i zero = _mm_setzero_si128();