SSE2 high precision 32x32 forward DCT

Enable the SSE2 implementation of the high-precision 32x32 forward DCT.
The intermediate stacks are 32 bits wide. The run time goes down from
32126 cycles to 13442 cycles.

Change-Id: Ib5ccafe3176c65bd6f2dbdef790bd47bbc880e56
This commit is contained in:
Jingning Han 2013-08-07 14:45:37 -07:00
parent b89eef8f82
commit 78136edcdc
6 changed files with 1253 additions and 8 deletions

View File

@ -143,7 +143,7 @@ typedef struct {
unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char segment_id; // Segment id for current frame
// Flags used for prediction status of various bistream signals
// Flags used for prediction status of various bit-stream signals
unsigned char seg_id_predicted;
// Indicates if the mb is part of the image (1) vs border (0)

View File

@ -27,6 +27,9 @@
// Broadcast a pair of 16-bit values to every 32-bit lane of an __m128i:
// `a` fills the low 16 bits and `b` the high 16 bits of each lane. Both
// arguments are truncated to 16 bits first. The pair is packed in uint32_t
// because a uint16_t operand is promoted to signed int, and shifting that
// left by 16 is undefined behavior as soon as bit 15 of `b` is set.
#define pair_set_epi16(a, b)                     \
  _mm_set1_epi32((int)((uint32_t)(uint16_t)(a) | \
                       ((uint32_t)(uint16_t)(b) << 16)))
// Build an __m128i whose 32-bit lanes, from lowest to highest, hold
// a, b, a, b. Each argument is expanded twice, so pass expressions without
// side effects.
#define pair_set_epi32(a, b) \
  _mm_set_epi32((b), (a), (b), (a))
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,

View File

@ -740,7 +740,7 @@ prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int p
specialize vp9_short_fdct8x4 sse2
# 32x32 forward DCT entry; arguments are InputData, OutputData and pitch.
prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
# NOTE(review): a bare `specialize` line is immediately followed by one that
# lists sse2 for the same function. This looks like the before/after pair of
# a diff whose +/- markers were lost; confirm only the sse2 line should remain.
specialize vp9_short_fdct32x32
specialize vp9_short_fdct32x32 sse2
# "_rd" variant of the 32x32 forward DCT with the same argument list; it also
# lists an sse2 specialization.
prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32_rd sse2

View File

@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <math.h>
#include <limits.h>

File diff suppressed because it is too large Load Diff

View File

@ -2573,13 +2573,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
}
// The same source file, vp9_dct32x32_sse2.c, is included twice below. Before
// each #include, FDCT32x32_2D is set to a different identifier and the
// precision macros are set to different values; all three macros are #undef'd
// afterwards so the second inclusion starts from a clean state.
// NOTE(review): the included file is not visible here. Presumably it defines
// a function named by FDCT32x32_2D and branches on the precision macros;
// confirm against vp9_dct32x32_sse2.c.

// First inclusion: vp9_short_fdct32x32_rd_sse2, high precision disabled.
#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
#define FDCT32x32_LOW_PRECISION 1
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_LOW_PRECISION
#undef FDCT32x32_HIGH_PRECISION
// Second inclusion: vp9_short_fdct32x32_sse2, high precision enabled.
#define FDCT32x32_2D vp9_short_fdct32x32_sse2
#define FDCT32x32_LOW_PRECISION 0
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_LOW_PRECISION
#undef FDCT32x32_HIGH_PRECISION