Optimize 16x16 idct10 function

Wrote an SSE2 version of the vp9_short_idct10_16x16 function. Compared
to the C version, the SSE2 version is 2.3x faster.

Change-Id: I314c4f09369648721798321eeed6f58e38857f26
This commit is contained in:
Yunqing Wang 2013-03-21 16:29:36 -07:00
parent 8a3233b54d
commit 869d6c0534
3 changed files with 729 additions and 363 deletions

View File

@@ -298,7 +298,7 @@ prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct16x16 sse2
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_16x16
specialize vp9_short_idct10_16x16 sse2
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16

File diff suppressed because it is too large Load Diff

View File

@@ -315,7 +315,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
input[48] *= dq[1];
// the idct halves ( >> 1) the pitch
vp9_short_idct10_16x16_c(input, output, 32);
vp9_short_idct10_16x16(input, output, 32);
input[0] = input[1] = input[2] = input[3] = 0;
input[16] = input[17] = input[18] = 0;