SSE2 version of vp9_short_fdct32x32_rd.

43,000 -> 5,750 cycles, about 7.5x faster.

Change-Id: Ibfd92821b9603f4ed9c256e0ececec14fa4565d0
This commit is contained in:
Christian Duvivier 2013-06-18 15:23:25 -07:00
parent bc70c60b25
commit 466e0cf303
4 changed files with 1230 additions and 2 deletions

View File

@ -25,7 +25,10 @@
// Packs two 16-bit values into one 32-bit word (a in the low half, b in the
// high half) and hands that word to _mm_set1_epi32.
// b is widened to uint32_t before the shift: a bare uint16_t operand is
// promoted to signed int, and shifting a value with bit 15 set (any negative
// 16-bit constant) left by 16 overflows int, which is undefined behavior.
#define pair_set_epi16(a, b) \
  _mm_set1_epi32((int)((uint32_t)(uint16_t)(a) | ((uint32_t)(uint16_t)(b) << 16)))
// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
static const int cospi_1_64 = 16364;
static const int cospi_2_64 = 16305;

View File

@ -600,7 +600,7 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int
specialize vp9_short_fdct32x32
prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32_rd
specialize vp9_short_fdct32x32_rd sse2
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct16x16 sse2

View File

@ -1366,6 +1366,9 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
temp_in[j] = input[j * shortpitch + i] << 2;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
// PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}

File diff suppressed because it is too large Load Diff