diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index dfb64c3a2..1c887bb6b 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -96,11 +96,15 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { for (int i = 0; i < count_test_block; ++i) { int16_t test_input_block[16]; int16_t test_temp_block[16]; - int16_t test_output_block[16]; + uint8_t dst[16], src[16]; + for (int j = 0; j < 16; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 16; ++j) - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; // TODO(Yaowu): this should be converted to a parameterized test // to test optimized versions of this function. @@ -120,10 +124,10 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { } // Because the bitstream is not frozen yet, use the idct in the codebase. - vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch); + vp9_short_idct4x4_add_c(test_temp_block, dst, 4); for (int j = 0; j < 16; ++j) { - const int diff = test_input_block[j] - test_output_block[j]; + const int diff = dst[j] - src[j]; const int error = diff * diff; if (max_error < error) max_error = error; diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b58945e51..26458e8a2 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -391,8 +391,8 @@ typedef struct macroblockd { int lossless; /* Inverse transform function pointers. */ - void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch); - void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch); + void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride); + void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride); void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob); void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride, struct macroblockd *xd); diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 2ff7696f8..80af49e84 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -18,12 +18,12 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int i; + int16_t output[16]; int a1, b1, c1, d1; int16_t *ip = input; int16_t *op = output; - const int half_pitch = pitch >> 1; for (i = 0; i < 4; i++) { a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR; @@ -37,63 +37,60 @@ void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) { op[3] = (d1 - c1) >> 1; ip += 4; - op += half_pitch; + op += 4; } ip = output; - op = output; for (i = 0; i < 4; i++) { - a1 = ip[half_pitch * 0] + ip[half_pitch * 3]; - b1 = ip[half_pitch * 1] + ip[half_pitch * 2]; - c1 = ip[half_pitch * 1] - ip[half_pitch * 2]; - d1 = ip[half_pitch * 0] - ip[half_pitch * 3]; + a1 = ip[4 * 0] + ip[4 * 3]; + b1 = ip[4 * 1] + ip[4 * 2]; + c1 = ip[4 * 1] - ip[4 * 2]; + d1 = ip[4 * 0] - ip[4 * 3]; - op[half_pitch * 0] = (a1 + b1 + 1) >> 1; - op[half_pitch * 1] = (c1 + d1) >> 1; - op[half_pitch * 2] = (a1 - b1) >> 1; - op[half_pitch * 3] = (d1 - c1) >> 1; + dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + + ((a1 + b1 + 1) >> 1)); + dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + + ((c1 + d1) >> 1)); + dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + + ((a1 - b1) >> 1)); + dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + + ((d1 - c1) >> 1)); ip++; - op++; + dest++; } } -void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) { +void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { int i; int16_t tmp[4]; int16_t *ip = in; int16_t *op = tmp; - const int half_pitch = pitch >> 1; op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1; ip = tmp; - op = out; for (i = 0; i < 4; i++) { - op[half_pitch * 0] = (ip[0] + 1) >> 1; - op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1; + dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + + ((ip[0] + 1) >> 1)); + dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + + (ip[0] >> 1)); + dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + + (ip[0] >> 1)); + dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + + (ip[0] >> 1)); ip++; - op++; + dest++; } } void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride) { - int r, c; int16_t dc = input_dc; - int16_t tmp[4 * 4]; - vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1); - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) - dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]); - - dst_ptr += stride; - pred_ptr += pitch; - } + vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride); } void vp9_idct4_1d_c(int16_t *input, int16_t *output) { @@ -116,10 +113,9 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) { output[3] = step[0] - step[3]; } -void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[4 * 4]; int16_t *outptr = out; - const int half_pitch = pitch >> 1; int i, j; int16_t temp_in[4], temp_out[4]; @@ -138,22 +134,24 @@ void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) { temp_in[j] = out[j * 4 + i]; vp9_idct4_1d(temp_in, temp_out); for (j = 0; j < 4; ++j) - output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); } } -void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int i; int a1; - int16_t *op = output; - const int half_pitch = pitch >> 1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { - op[0] = op[1] = op[2] = op[3] = a1; - op += half_pitch; + dest[0] = clip_pixel(dest[0] + a1); + dest[1] = clip_pixel(dest[1] + a1); + dest[2] = clip_pixel(dest[2] + a1); + dest[3] = clip_pixel(dest[3] + a1); + dest += dest_stride; } } @@ -285,8 +283,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(s3); } -void vp9_short_iht4x4_c(int16_t *input, int16_t *output, - int pitch, int tx_type) { +void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, + int tx_type) { const transform_2d IHT_4[] = { { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0 { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1 @@ -312,10 +310,10 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output, temp_in[j] = out[j * 4 + i]; IHT_4[tx_type].cols(temp_in, temp_out); for (j = 0; j < 4; ++j) - output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); } } - static void iadst8_1d(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index 01859df5e..d47fca190 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -11,11 +11,10 @@ #include "vp9/common/vp9_invtrans.h" #include "./vp9_rtcd.h" -void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob, - int16_t *dqcoeff, int16_t *diff, - int pitch) { +void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff, + uint8_t *dest, int stride) { if (eob <= 1) - xd->inv_txm4x4_1(dqcoeff, diff, pitch); + xd->inv_txm4x4_1_add(dqcoeff, dest, stride); else - xd->inv_txm4x4(dqcoeff, diff, pitch); + xd->inv_txm4x4_add(dqcoeff, dest, stride); } diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h index 2aeb584c9..dbdc50a2a 100644 --- a/vp9/common/vp9_invtrans.h +++ b/vp9/common/vp9_invtrans.h @@ -15,7 +15,6 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob, - int16_t *dqcoeff, int16_t *diff, - int pitch); +void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff, + uint8_t *dest, int stride); #endif // VP9_COMMON_VP9_INVTRANS_H_ diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index cf8dd33c4..cb353b1ed 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -85,9 +85,6 @@ prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLO specialize vp9_intra4x4_predict; if [ "$CONFIG_VP9_DECODER" = "yes" ]; then -prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride" -specialize vp9_add_residual_4x4 sse2 - prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" specialize vp9_add_constant_residual_8x8 sse2 @@ -179,11 +176,11 @@ specialize vp9_convolve8_avg_vert ssse3 # # dct # -prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct4x4_1 +prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct4x4_1_add -prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct4x4 sse2 +prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct4x4_add sse2 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 @@ -212,12 +209,12 @@ specialize vp9_short_idct1_32x32 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_32x32_add +prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +specialize vp9_short_iht4x4_add + prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_short_iht8x8_add -prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type" -specialize vp9_short_iht4x4 - prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" specialize vp9_short_iht16x16_add @@ -229,12 +226,11 @@ specialize vp9_idct4_1d sse2 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" specialize vp9_dc_only_idct_add sse2 -prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_iwalsh4x4_1 -prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_iwalsh4x4 -prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" -specialize vp9_dc_only_inv_walsh_add +prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_iwalsh4x4_1_add + +prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_iwalsh4x4_add prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" specialize vp9_sad32x3 diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index ab8604c75..599dcff93 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -73,7 +73,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, *(int *)dst_ptr = _mm_cvtsi128_si32(p1); } -void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, @@ -81,7 +81,6 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { (int16_t)cospi_24_64, (int16_t)-cospi_8_64, (int16_t)cospi_8_64, (int16_t)cospi_24_64); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const int half_pitch = pitch >> 1; __m128i input0, input1, input2, input3; // Rows @@ -188,14 +187,23 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { input2 = _mm_srai_epi16(input2, 4); input3 = _mm_srai_epi16(input3, 4); - // Store results - _mm_storel_epi64((__m128i *)output, input2); - input2 = _mm_srli_si128(input2, 8); - _mm_storel_epi64((__m128i *)(output + half_pitch), input2); +#define RECON_AND_STORE4X4(dest, in_x) \ + { \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)dest = _mm_cvtsi128_si32(d0); \ + dest += stride; \ + } - _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3); - input3 = _mm_srli_si128(input3, 8); - _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3); + input0 = _mm_srli_si128(input2, 8); + input1 = _mm_srli_si128(input3, 8); + + RECON_AND_STORE4X4(dest, input2); + RECON_AND_STORE4X4(dest, input0); + RECON_AND_STORE4X4(dest, input1); + RECON_AND_STORE4X4(dest, input3); } void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 2fc6bd930..83fb8f9d9 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -1006,14 +1006,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { pc->uv_dc_delta_q == 0 && pc->uv_ac_delta_q == 0; if (xd->lossless) { - xd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1; - xd->inv_txm4x4 = vp9_short_iwalsh4x4; xd->itxm_add = vp9_idct_add_lossless_c; xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c; xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c; } else { - xd->inv_txm4x4_1 = vp9_short_idct4x4_1; - xd->inv_txm4x4 = vp9_short_idct4x4; xd->itxm_add = vp9_idct_add; xd->itxm_add_y_block = vp9_idct_add_y_block; xd->itxm_add_uv_block = vp9_idct_add_uv_block; diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 10b585b3f..7726598bc 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride, } } -static void add_residual(const int16_t *diff, uint8_t *dest, int stride, - int width, int height) { - int r, c; - - for (r = 0; r < height; r++) { - for (c = 0; c < width; c++) - dest[c] = clip_pixel(diff[c] + dest[c]); - - dest += stride; - diff += width; - } -} - -void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) { - add_residual(diff, dest, stride, 4, 4); -} - static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, int width, int height) { int r, c; @@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, if (tx_type == DCT_DCT) { vp9_idct_add(input, dest, stride, eob); } else { - DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); - - vp9_short_iht4x4(input, output, 4, tx_type); + vp9_short_iht4x4_add(input, dest, stride, tx_type); vpx_memset(input, 0, 32); - vp9_add_residual_4x4(output, dest, stride); } } @@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, } void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { - DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); - if (eob > 1) { - // the idct halves ( >> 1) the pitch - vp9_short_idct4x4(input, output, 4 << 1); + vp9_short_idct4x4_add(input, dest, stride); vpx_memset(input, 0, 32); - vp9_add_residual_4x4(output, dest, stride); } else { vp9_dc_only_idct_add(input[0], dest, dest, stride, stride); ((int *)input)[0] = 0; @@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { } void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) { - DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); - input[0] = dc; - - // the idct halves ( >> 1) the pitch - vp9_short_idct4x4(input, output, 4 << 1); + vp9_short_idct4x4_add(input, dest, stride); vpx_memset(input, 0, 32); - vp9_add_residual_4x4(output, dest, stride); } void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride, int eob) { - DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); - if (eob > 1) { - vp9_short_iwalsh4x4_c(input, output, 4 << 1); + vp9_short_iwalsh4x4_add(input, dest, stride); vpx_memset(input, 0, 32); - vp9_add_residual_4x4(output, dest, stride); } else { - vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride); + vp9_short_iwalsh4x4_1_add_c(input, dest, stride); ((int *)input)[0] = 0; } } void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride, int dc) { - DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); - input[0] = dc; - vp9_short_iwalsh4x4_c(input, output, 4 << 1); + vp9_short_iwalsh4x4_add(input, dest, stride); vpx_memset(input, 0, 32); - vp9_add_residual_4x4(output, dest, stride); } void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c index 72036c2d4..54ec67f24 100644 --- a/vp9/decoder/x86/vp9_dequantize_sse2.c +++ b/vp9/decoder/x86/vp9_dequantize_sse2.c @@ -15,49 +15,6 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) { - const int width = 4; - const __m128i zero = _mm_setzero_si128(); - - // Diff data - const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width)); - const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width)); - const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width)); - const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width)); - - // Prediction data. - __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride)); - __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride)); - __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride)); - __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride)); - - p0 = _mm_unpacklo_epi8(p0, zero); - p1 = _mm_unpacklo_epi8(p1, zero); - p2 = _mm_unpacklo_epi8(p2, zero); - p3 = _mm_unpacklo_epi8(p3, zero); - - p0 = _mm_add_epi16(p0, d0); - p1 = _mm_add_epi16(p1, d1); - p2 = _mm_add_epi16(p2, d2); - p3 = _mm_add_epi16(p3, d3); - - p0 = _mm_packus_epi16(p0, p1); - p2 = _mm_packus_epi16(p2, p3); - - *(int *)dest = _mm_cvtsi128_si32(p0); - dest += stride; - - p0 = _mm_srli_si128(p0, 8); - *(int *)dest = _mm_cvtsi128_si32(p0); - dest += stride; - - *(int *)dest = _mm_cvtsi128_si32(p2); - dest += stride; - - p2 = _mm_srli_si128(p2, 8); - *(int *)dest = _mm_cvtsi128_si32(p2); -} - void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest, int stride) { uint8_t abs_diff; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index e1ddb48d9..f3a03f3c8 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1169,8 +1169,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; - cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; cpi->mb.optimize = 0; cpi->common.filter_level = 0; cpi->zbin_mode_boost_enabled = 0; @@ -1178,8 +1178,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { } else { cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; - cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; } } diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index fe5bdb309..f8cf50f84 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -53,9 +53,6 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib, int16_t* const src_diff = raster_block_offset_int16(xd, bsize, 0, ib, x->plane[0].src_diff); - int16_t* const diff = - raster_block_offset_int16(xd, bsize, 0, ib, - xd->plane[0].diff); int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16); const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); @@ -72,17 +69,15 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib, if (tx_type != DCT_DCT) { vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type); x->quantize_b_4x4(x, ib, tx_type, 16); - vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), - diff, 4 << bwl, tx_type); + vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), dst, + xd->plane[0].dst.stride, tx_type); } else { x->fwd_txm4x4(src_diff, coeff, 8 << bwl); x->quantize_b_4x4(x, ib, tx_type, 16); - vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib], + vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib], BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), - diff, 8 << bwl); + dst, xd->plane[0].dst.stride); } - - vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride); } void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) { diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index bbc97da61..84b350792 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -425,7 +425,6 @@ struct encode_b_args { VP9_COMMON *cm; MACROBLOCK *x; struct optimize_ctx *ctx; - int *wip_txfrm_size; // for "work in progress" only... will remove once done }; static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -494,14 +493,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK* const x = args->x; - int *wip_txfrm_size = args->wip_txfrm_size; MACROBLOCKD* const xd = &x->e_mbd; - const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x); const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, block, ss_txfrm_size); - int16_t* const diff = raster_block_offset_int16(xd, bsize, plane, - raster_block, - xd->plane[plane].diff); uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block, xd->plane[plane].dst.buf, @@ -517,7 +511,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, case TX_32X32: vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst, xd->plane[plane].dst.stride); - *wip_txfrm_size = 32; break; case TX_16X16: tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; @@ -529,7 +522,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, block, 16), dst, xd->plane[plane].dst.stride, tx_type); } - *wip_txfrm_size = 16; break; case TX_8X8: tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; @@ -541,7 +533,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, block, 16), dst, xd->plane[plane].dst.stride, tx_type); } - *wip_txfrm_size = 8; break; case TX_4X4: tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; @@ -549,13 +540,13 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block], - BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2); + vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block], + BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst, + xd->plane[plane].dst.stride); } else { - vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - diff, bw, tx_type); + vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), + dst, xd->plane[plane].dst.stride, tx_type); } - *wip_txfrm_size = 4; break; } } @@ -563,16 +554,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL, NULL}; + struct encode_b_args arg = {cm, x, NULL}; - foreach_transformed_block_in_plane(xd, bsize, 0, - xform_quant, &arg); + foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg); } void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL, NULL}; + struct encode_b_args arg = {cm, x, NULL}; foreach_transformed_block_uv(xd, bsize, xform_quant, &arg); } @@ -581,61 +571,37 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - int wip_txfrm_size = 0; - struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size}; + struct encode_b_args arg = {cm, x, &ctx}; vp9_subtract_sby(x, bsize); if (x->optimize) vp9_optimize_init(xd, bsize, &ctx); - foreach_transformed_block_in_plane(xd, bsize, 0, - encode_block, &arg); - if (wip_txfrm_size < 8) - vp9_recon_sby(xd, bsize); + foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); } void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - int wip_txfrm_size = 0; - struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size}; + struct encode_b_args arg = {cm, x, &ctx}; vp9_subtract_sbuv(x, bsize); if (x->optimize) vp9_optimize_init(xd, bsize, &ctx); foreach_transformed_block_uv(xd, bsize, encode_block, &arg); - - if (wip_txfrm_size < 8) - vp9_recon_sbuv(xd, bsize); } void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - int wip_txfrm_size = 0; - struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size}; + struct encode_b_args arg = {cm, x, &ctx}; vp9_subtract_sb(x, bsize); if (x->optimize) vp9_optimize_init(xd, bsize, &ctx); -#if 0 + foreach_transformed_block(xd, bsize, encode_block, &arg); - - vp9_recon_sb(xd, bsize); -#else - // wip version... will use foreach_transformed_block when done - foreach_transformed_block_in_plane(xd, bsize, 0, - encode_block, &arg); - if (wip_txfrm_size < 8) - vp9_recon_sby(xd, bsize); - wip_txfrm_size = 0; - - foreach_transformed_block_uv(xd, bsize, encode_block, &arg); - - if (wip_txfrm_size < 8) - vp9_recon_sbuv(xd, bsize); -#endif } diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index a144c1bd4..02d46cb7d 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -1178,11 +1178,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.lossless = oxcf->lossless; if (cpi->oxcf.lossless) { - cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; } else { - cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; } cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 78bbc3639..8c1ef4915 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -627,11 +627,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, BLOCK_SIZE_SB8X8, 0, ib, x->plane[0].src_diff); - int16_t* const diff = - raster_block_offset_int16(xd, - BLOCK_SIZE_SB8X8, - 0, ib, - xd->plane[0].diff); int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16); uint8_t* const dst = raster_block_offset_uint8(xd, @@ -703,18 +698,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, xd->mode_info_context->bmi[ib].as_mode.first = (B_PREDICTION_MODE)(*best_mode); - // inverse transform - if (best_tx_type != DCT_DCT) - vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type); - else - xd->inv_txm4x4(best_dqcoeff, diff, 16); - vp9_intra4x4_predict(xd, ib, BLOCK_SIZE_SB8X8, *best_mode, dst, xd->plane[0].dst.stride); - vp9_recon_b(dst, diff, 8, - dst, xd->plane[0].dst.stride); + + // inverse transform + if (best_tx_type != DCT_DCT) { + vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride, + best_tx_type); + } else { + xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride); + } return best_rd; }