vpx_fdctNxN_1_sse2: reduce store size

Only output[0] needs to be set, and store_output is more involved than a
movdqa in the high bitdepth case, so store that single value directly instead.

Change-Id: I2cbd85d7cf74688bdf47eb767934fe42e02bff67
This commit is contained in:
James Zern 2016-04-01 12:35:33 -07:00
parent c98f8e04e5
commit 3735def667

View File

@ -40,7 +40,7 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
in1 = _mm_add_epi32(tmp, in0); in1 = _mm_add_epi32(tmp, in0);
in0 = _mm_slli_epi32(in1, 1); in0 = _mm_slli_epi32(in1, 1);
store_output(&in0, output); output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
} }
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
@ -80,7 +80,7 @@ void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
in0 = _mm_srli_si128(sum, 8); in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0); in1 = _mm_add_epi32(sum, in0);
store_output(&in1, output); output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
} }
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
@ -149,7 +149,7 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
in1 = _mm_add_epi32(sum, in0); in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 1); in1 = _mm_srai_epi32(in1, 1);
store_output(&in1, output); output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
} }
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
@ -221,7 +221,7 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
in1 = _mm_add_epi32(sum, in0); in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 3); in1 = _mm_srai_epi32(in1, 3);
store_output(&in1, output); output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
} }
#define DCT_HIGH_BIT_DEPTH 0 #define DCT_HIGH_BIT_DEPTH 0