SSE2 version of vp9_short_fdct32x32_rd.
43,000 -> 5,750 cycles, about 7.5x faster.

Change-Id: Ibfd92821b9603f4ed9c256e0ececec14fa4565d0
This commit is contained in:
parent
bc70c60b25
commit
466e0cf303
@ -25,7 +25,10 @@
|
||||
#define pair_set_epi16(a, b) \
|
||||
_mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))
|
||||
|
||||
// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.
|
||||
// Constants:
|
||||
// for (int i = 1; i< 32; ++i)
|
||||
// printf("static const int cospi_%d_64 = %.0f;\n", i,
|
||||
// round(16384 * cos(i*M_PI/64)));
|
||||
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
|
||||
static const int cospi_1_64 = 16364;
|
||||
static const int cospi_2_64 = 16305;
|
||||
|
@ -600,7 +600,7 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int
|
||||
specialize vp9_short_fdct32x32
|
||||
|
||||
prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
|
||||
specialize vp9_short_fdct32x32_rd
|
||||
specialize vp9_short_fdct32x32_rd sse2
|
||||
|
||||
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
|
||||
specialize vp9_short_fdct16x16 sse2
|
||||
|
@ -1366,6 +1366,9 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
|
||||
temp_in[j] = input[j * shortpitch + i] << 2;
|
||||
dct32_1d(temp_in, temp_out, 0);
|
||||
for (j = 0; j < 32; ++j)
|
||||
// TODO(cd): see quality impact of only doing
|
||||
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
|
||||
// PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
|
||||
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
|
||||
}
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user