From cbb991b6b862a4c3b304a2a01261d5199ad480ce Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Mon, 12 Jun 2017 14:03:37 -0700 Subject: [PATCH 1/4] Convert 8x8 idct x86 macros to inline functions Change-Id: Id59865fd6c453a24121ce7160048d67875fc67ce --- vp9/common/x86/vp9_idct_intrin_sse2.c | 17 +- vpx_dsp/x86/inv_txfm_sse2.c | 1604 ++++++++++--------------- vpx_dsp/x86/inv_txfm_sse2.h | 163 +-- vpx_dsp/x86/inv_txfm_ssse3.c | 313 +++-- vpx_dsp/x86/transpose_sse2.h | 90 +- 5 files changed, 976 insertions(+), 1211 deletions(-) diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index bb2dcf52b..7e8089b51 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -54,7 +54,6 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data @@ -106,14 +105,14 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index d22108497..b53505f7e 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -13,6 +13,14 @@ #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void transpose_16bit_4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i eight = _mm_set1_epi16(8); @@ -76,7 +84,7 @@ void idct4_sse2(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); __m128i u[2]; - transpose_16bit_4x4(in); + transpose_16bit_4(in); // stage 1 u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpackhi_epi16(in[0], in[1]); @@ -99,7 +107,7 @@ void iadst4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8], in7; - transpose_16bit_4x4(in); + transpose_16bit_4(in); in7 = _mm_srli_si128(in[1], 8); in7 = _mm_add_epi16(in7, in[0]); in7 = _mm_sub_epi16(in7, in[1]); @@ -138,71 +146,35 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0); \ - res1 = idct_calc_wraplow_sse2(lo_0, hi_0, 
cst1); \ - } +// Multiply elements by constants and add them together. +static INLINE void multiplication_and_add( + const __m128i *const in0, const __m128i *const in1, + const __m128i *const in2, const __m128i *const in3, + const __m128i *const cst0, const __m128i *const cst1, + const __m128i *const cst2, const __m128i *const cst3, __m128i *const res0, + __m128i *const res1, __m128i *const res2, __m128i *const res3) { + const __m128i lo_0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i hi_0 = _mm_unpackhi_epi16(*in0, *in1); + const __m128i lo_1 = _mm_unpacklo_epi16(*in2, *in3); + const __m128i hi_1 = _mm_unpackhi_epi16(*in2, *in3); + *res0 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst0); + *res1 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst1); + *res2 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst2); + *res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3); +} -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - stp1_5 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_1); \ - stp1_6 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_0); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_add_epi16(stp1_0, stp2_7); \ - out1 = _mm_add_epi16(stp1_1, stp1_6); \ - out2 = _mm_add_epi16(stp1_2, stp1_5); \ - out3 = _mm_add_epi16(stp1_3, stp2_4); \ - out4 = _mm_sub_epi16(stp1_3, stp2_4); \ - out5 = _mm_sub_epi16(stp1_2, stp1_5); \ - out6 = _mm_sub_epi16(stp1_1, stp1_6); \ - out7 = _mm_sub_epi16(stp1_0, stp2_7); \ - } +static void multiplication_and_add_2(const __m128i *const in0, + const __m128i *const in1, + const __m128i *const cst0, + const __m128i *const cst1, + __m128i *const res0, __m128i *const res1) { + const __m128i lo = _mm_unpacklo_epi16(*in0, *in1); + const __m128i hi = _mm_unpackhi_epi16(*in0, *in1); + *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0); + *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1); +} -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); +static INLINE void idct8(const __m128i *const in, __m128i *const out) { const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); const __m128i stg1_2 = 
pair_set_epi16(-cospi_20_64, cospi_12_64); @@ -211,66 +183,98 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + /* Stage1 */ + multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &stg1_0, &stg1_1, + &stg1_2, &stg1_3, &stp1_4, &stp1_7, &stp1_5, &stp1_6); + + /* Stage2 */ + multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &stg2_0, &stg2_1, + &stg2_2, &stg2_3, &stp2_0, &stp2_1, &stp2_2, &stp2_3); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + /* Stage3 */ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + multiplication_and_add_2(&stp2_6, &stp2_5, &stg2_1, &stg2_0, &stp1_5, + &stp1_6); + + /* Stage4 */ + out[0] = _mm_add_epi16(stp1_0, stp2_7); + out[1] = _mm_add_epi16(stp1_1, stp1_6); + out[2] = _mm_add_epi16(stp1_2, stp1_5); + out[3] = _mm_add_epi16(stp1_3, stp2_4); + out[4] = _mm_sub_epi16(stp1_3, stp2_4); + out[5] = _mm_sub_epi16(stp1_2, stp1_5); + out[6] = _mm_sub_epi16(stp1_1, stp1_6); + out[7] = _mm_sub_epi16(stp1_0, stp2_7); +} + +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + __m128i in[8]; int i; // Load input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); + in[4] = load_input_data(input + 8 * 4); + in[5] = load_input_data(input + 8 * 5); + in[6] = load_input_data(input + 8 * 6); + in[7] = load_input_data(input + 8 * 7); // 2-D for (i = 0; i < 2; i++) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); + idct8(in, in); } // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); } void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); int a; a = (int)dct_const_round_shift(input[0] * cospi_16_64); @@ -279,37 +283,22 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - 
RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); + recon_and_store(dest + 0 * stride, dc_value); + recon_and_store(dest + 1 * stride, dc_value); + recon_and_store(dest + 2 * stride, dc_value); + recon_and_store(dest + 3 * stride, dc_value); + recon_and_store(dest + 4 * stride, dc_value); + recon_and_store(dest + 5 * stride, dc_value); + recon_and_store(dest + 6 * stride, dc_value); + recon_and_store(dest + 7 * stride, dc_value); } void idct8_sse2(__m128i *in) { - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); + idct8(in, in); } void iadst8_sse2(__m128i *in) { @@ -527,23 +516,23 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i in[8]; + __m128i stp1_2, stp1_3, stp1_4, stp1_5; + __m128i stp2_0, stp2_2, stp2_4, stp2_5, stp2_6; + __m128i tmp[4]; // Rows. Load 4-row input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + transpose_16bit_4x4(in, in); // Stage1 { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); + const __m128i lo_17 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in[1], zero); stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17); stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35); @@ -551,273 +540,197 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, // Stage2 { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); + const __m128i lo_04 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in[1], zero); stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04); stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26); - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); + tmp[0] = _mm_add_epi16(stp1_4, stp1_5); + tmp[1] = _mm_sub_epi16(stp1_4, stp1_5); - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + stp2_4 = tmp[0]; + stp2_5 = _mm_unpacklo_epi64(tmp[1], zero); + stp2_6 = _mm_unpackhi_epi64(tmp[1], zero); } // Stage3 { const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - tmp0 = _mm_add_epi16(stp2_0, stp2_2); - tmp1 = _mm_sub_epi16(stp2_0, stp2_2); - stp1_2 = _mm_unpackhi_epi64(tmp1, tmp0); - stp1_3 = _mm_unpacklo_epi64(tmp1, tmp0); + tmp[0] = _mm_add_epi16(stp2_0, stp2_2); + tmp[1] = _mm_sub_epi16(stp2_0, stp2_2); + stp1_2 = _mm_unpackhi_epi64(tmp[1], tmp[0]); + stp1_3 = _mm_unpacklo_epi64(tmp[1], tmp[0]); stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56); // stg3_1 = stg2_0 } // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); + tmp[0] = _mm_add_epi16(stp1_3, stp2_4); + tmp[1] = _mm_add_epi16(stp1_2, stp1_5); + tmp[2] = _mm_sub_epi16(stp1_3, stp2_4); + tmp[3] = _mm_sub_epi16(stp1_2, stp1_5); - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + idct8x8_12_transpose_16bit_4x8(tmp, in); + in[4] = in[5] = in[6] = in[7] = zero; - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); + idct8(in, in); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = 
_mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); } -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = 
_mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \ - stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } +#define IDCT16 \ + /* Stage2 */ \ + multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &stg2_0, &stg2_1, \ + &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9, \ + &stp2_14); \ + \ + multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &stg2_4, &stg2_5, \ + &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11, \ + &stp2_12); \ + \ + /* Stage3 */ \ + multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &stg3_0, &stg3_1, \ + &stg3_2, &stg3_3, &stp1_4, &stp1_7, &stp1_5, \ + &stp1_6); \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + /* Stage4 */ \ + multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &stg4_0, &stg4_1, \ + &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2, \ + &stp2_3); \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \ + &stg4_5, &stg4_6, &stg4_7, &stp2_9, &stp2_14, \ + &stp2_10, &stp2_13); \ + \ + /* Stage5 */ \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \ + &stp1_6); \ + \ + stp1_8 = 
_mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + \ + /* Stage6 */ \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \ + &stp2_11, &stp2_12); #define IDCT16_10 \ /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ + multiplication_and_add(&in[1], &zero, &zero, &in[3], &stg2_0, &stg2_1, \ + &stg2_6, &stg2_7, &stp1_8_0, &stp1_15, &stp1_11, \ + &stp1_12_0); \ \ /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + multiplication_and_add_2(&in[2], &zero, &stg3_0, &stg3_1, &stp2_4, &stp2_7); \ \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ \ /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + multiplication_and_add_2(&in[0], &zero, &stg4_0, &stg4_1, &stp1_0, &stp1_1); \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ + multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \ + &stg4_5, &stg4_6, &stg4_7, &stp2_9, &stp2_14, \ + &stp2_10, &stp2_13); \ \ /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \ + &stp1_6); \ \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ \ - stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \ - stp1_6 = 
idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ \ /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } + multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \ + &stp2_11, &stp2_12); void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); @@ -930,7 +843,7 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, // Final rounding and shift in[j] = _mm_adds_epi16(in[j], final_rounding); in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + recon_and_store(dest + j * stride, in[j]); } dest += 8; @@ -940,7 +853,6 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); int a, i; a = (int)dct_const_round_shift(input[0] * cospi_16_64); @@ -950,8 +862,8 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + recon_and_store(dest + 0, dc_value); + recon_and_store(dest + 8, dc_value); dest += stride; } } @@ -1494,10 +1406,8 @@ static void idct16_8col(__m128i *in) { t[4] = s[4]; t[7] = s[7]; - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - t[5] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16); - t[6] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16); + multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16, + 
&t[5], &t[6]); t[8] = _mm_add_epi16(s[8], s[11]); t[9] = _mm_add_epi16(s[9], s[10]); @@ -1599,7 +1509,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, in[2] = load_input_data(input + 8 * 4); in[3] = load_input_data(input + 8 * 6); - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + transpose_16bit_4x4(in, in); // Stage2 { @@ -1732,7 +1642,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, // Final rounding and shift in[j] = _mm_adds_epi16(in[j], final_rounding); in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + recon_and_store(dest + j * stride, in[j]); } dest += 8; @@ -1747,599 +1657,420 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, #define IDCT32_34 \ /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ + multiplication_and_add_2(&in[1], &zero, &stg1_0, &stg1_1, &stp1_16, \ + &stp1_31); \ + multiplication_and_add_2(&zero, &in[7], &stg1_6, &stg1_7, &stp1_19, \ + &stp1_28); \ + multiplication_and_add_2(&in[5], &zero, &stg1_8, &stg1_9, &stp1_20, \ + &stp1_27); \ + multiplication_and_add_2(&zero, &in[3], &stg1_14, &stg1_15, &stp1_23, \ + &stp1_24); \ \ /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + multiplication_and_add_2(&in[2], &zero, &stg2_0, &stg2_1, &stp2_8, \ + &stp2_15); \ + multiplication_and_add_2(&zero, &in[6], &stg2_6, &stg2_7, &stp2_11, \ + &stp2_12); \ \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ \ /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + multiplication_and_add_2(&in[4], &zero, &stg3_0, &stg3_1, &stp1_4, &stp1_7); \ \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + 
stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + multiplication_and_add(&stp1_16, &stp1_31, &stp1_19, &stp1_28, &stg3_4, \ + &stg3_5, &stg3_6, &stg3_4, &stp1_17, &stp1_30, \ + &stp1_18, &stp1_29); \ \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ + multiplication_and_add(&stp1_20, &stp1_27, &stp1_23, &stp1_24, &stg3_8, \ + &stg3_9, &stg3_10, &stg3_8, &stp1_21, &stp1_26, \ + &stp1_22, &stp1_25); \ \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ \ /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + multiplication_and_add_2(&in[0], &zero, &stg4_0, &stg4_1, &stp2_0, &stp2_1); \ \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ + multiplication_and_add(&stp2_8, &stp2_15, &stp2_11, &stp2_12, &stg4_4, \ + &stg4_5, &stg4_6, &stg4_4, &stp2_9, &stp2_14, \ + &stp2_10, &stp2_13); \ \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = 
_mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ \ /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \ + &stp1_6); \ \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ \ - stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \ - stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \ + multiplication_and_add(&stp2_18, &stp2_29, &stp2_19, &stp2_28, &stg4_4, \ + &stg4_5, &stg4_4, &stg4_5, &stp1_18, &stp1_29, \ + &stp1_19, &stp1_28); \ + multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg4_6, \ + &stg4_4, &stg4_6, &stg4_4, &stp1_20, &stp1_27, \ + &stp1_21, &stp1_26); \ \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ \ /* Stage6 */ \ - { \ - 
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ + multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \ + &stp2_11, &stp2_12); \ \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ \ /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = 
_mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp1_20, &stp1_27, \ + &stp1_21, &stp1_26); \ + multiplication_and_add(&stp2_22, &stp2_25, &stp2_23, &stp2_24, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp1_22, &stp1_25, \ + &stp1_23, &stp1_24); \ \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; -#define IDCT32 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, 
stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \ - stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, 
stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - 
stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } +#define IDCT32 \ + /* Stage1 */ \ + multiplication_and_add(&in[1], &in[31], &in[17], &in[15], &stg1_0, &stg1_1, \ + &stg1_2, &stg1_3, &stp1_16, &stp1_31, &stp1_17, \ + &stp1_30); \ + multiplication_and_add(&in[9], &in[23], &in[25], &in[7], &stg1_4, &stg1_5, \ + &stg1_6, &stg1_7, &stp1_18, &stp1_29, &stp1_19, \ + &stp1_28); \ + multiplication_and_add(&in[5], &in[27], &in[21], &in[11], &stg1_8, &stg1_9, \ + &stg1_10, &stg1_11, &stp1_20, &stp1_27, &stp1_21, \ + &stp1_26); \ + multiplication_and_add(&in[13], &in[19], &in[29], &in[3], &stg1_12, \ + &stg1_13, &stg1_14, &stg1_15, &stp1_22, &stp1_25, \ + &stp1_23, &stp1_24); \ + \ + /* Stage2 */ \ + multiplication_and_add(&in[2], &in[30], &in[18], &in[14], &stg2_0, &stg2_1, \ + &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9, \ + &stp2_14); \ + multiplication_and_add(&in[10], &in[22], &in[26], &in[6], &stg2_4, &stg2_5, \ + &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11, \ + &stp2_12); \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ + \ + /* Stage3 */ \ + multiplication_and_add(&in[4], &in[28], &in[20], &in[12], &stg3_0, &stg3_1, \ + &stg3_2, &stg3_3, &stp1_4, &stp1_7, &stp1_5, \ + &stp1_6); \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + multiplication_and_add(&stp2_17, &stp2_30, &stp2_18, &stp2_29, &stg3_4, \ + &stg3_5, &stg3_6, &stg3_4, &stp1_17, &stp1_30, \ + &stp1_18, &stp1_29); \ + multiplication_and_add(&stp2_21, &stp2_26, &stp2_22, &stp2_25, &stg3_8, \ + &stg3_9, &stg3_10, &stg3_8, &stp1_21, &stp1_26, \ + &stp1_22, &stp1_25); \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ + \ + /* Stage4 */ \ + multiplication_and_add(&in[0], &in[16], &in[8], &in[24], &stg4_0, &stg4_1, \ + &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2, \ + &stp2_3); \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = 
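For reference, each multiplication_and_add() call above performs two constant rotations through idct_calc_wraplow_sse2(). A scalar model of one output lane, assuming the usual DCT_CONST_BITS == 14 and a pair_set_epi16(c0, c1) constant (butterfly_lane is a hypothetical name used only for illustration):

#include <stdint.h>

// One lane of idct_calc_wraplow_sse2(): multiply the interleaved pair
// (x, y) by the packed constants (c0, c1), round, shift by DCT_CONST_BITS
// and narrow back to 16 bits (SIMD saturation details omitted).
static int16_t butterfly_lane(int16_t x, int16_t y, int c0, int c1) {
  const int32_t t = (int32_t)x * c0 + (int32_t)y * c1;
  return (int16_t)((t + (1 << 13)) >> 14);
}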
_mm_add_epi16(stp1_7, stp1_6); \ + \ + multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \ + &stg4_5, &stg4_6, &stg4_4, &stp2_9, &stp2_14, \ + &stp2_10, &stp2_13); \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ + \ + /* Stage5 */ \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \ + &stp1_6); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + multiplication_and_add(&stp2_18, &stp2_29, &stp2_19, &stp2_28, &stg4_4, \ + &stg4_5, &stg4_4, &stg4_5, &stp1_18, &stp1_29, \ + &stp1_19, &stp1_28); \ + multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg4_6, \ + &stg4_4, &stg4_6, &stg4_4, &stp1_20, &stp1_27, \ + &stp1_21, &stp1_26); \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + \ + /* Stage6 */ \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \ + &stp2_11, &stp2_12); \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = 
_mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ + \ + /* Stage7 */ \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp1_20, &stp1_27, \ + &stp1_21, &stp1_26); \ + multiplication_and_add(&stp2_22, &stp2_25, &stp2_23, &stp2_24, &stg6_0, \ + &stg4_0, &stg6_0, &stg4_0, &stp1_22, &stp1_25, \ + &stp1_23, &stp1_24); \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -2480,7 +2211,7 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, // Final rounding and shift in[j] = _mm_adds_epi16(in[j], final_rounding); in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + recon_and_store(dest + j * stride, in[j]); } dest += 8; @@ -2752,7 +2483,7 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, // Final rounding and shift in[j] = _mm_adds_epi16(in[j], final_rounding); in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + recon_and_store(dest + j * stride, in[j]); } dest += 8; @@ -2762,7 +2493,6 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); int a, j; a = (int)dct_const_round_shift(input[0] * cospi_16_64); @@ -2772,9 +2502,9 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); + recon_and_store(dest + 0 + j * stride, dc_value); + recon_and_store(dest + 8 + j * stride, dc_value); + recon_and_store(dest + 16 + j * stride, dc_value); + recon_and_store(dest + 24 + j * stride, dc_value); } } diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 9eead0915..e15a97e1e 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -46,45 +46,44 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = 
_mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } +static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 30 31 32 33 00 01 02 03 + // in[1]: 20 21 22 23 10 11 12 13 + // in[2]: 40 41 42 43 70 71 72 73 + // in[3]: 50 51 52 53 60 61 62 63 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]); + + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 02 12 22 32 03 13 23 33 + // tr1_2: 40 50 60 70 41 51 61 71 + // tr1_3: 42 52 62 72 43 53 63 73 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); +} static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); @@ -151,7 +150,8 @@ static INLINE __m128i load_input_data(const tran_low_t *data) { #endif } -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { in[0] = load_input_data(input + 0 * 16); in[1] = load_input_data(input + 1 * 16); in[2] = load_input_data(input + 2 * 16); @@ -171,18 +171,17 @@ static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { in[15] = 
load_input_data(input + 15 * 16); } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } +static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d0 = _mm_packus_epi16(d0, d0); + _mm_storel_epi64((__m128i *)(dest), d0); +} static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); // Final rounding and shift in[0] = _mm_adds_epi16(in[0], final_rounding); in[1] = _mm_adds_epi16(in[1], final_rounding); @@ -218,60 +217,24 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { in[14] = _mm_srai_epi16(in[14], 6); in[15] = _mm_srai_epi16(in[15], 6); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); } -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = 
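recon_and_store() above adds one 8-pixel row of 16-bit residuals to the destination and writes it back with unsigned saturation. A scalar sketch of the same per-pixel operation (recon_and_store_scalar is a hypothetical name, for illustration only):

#include <stdint.h>

// Per-pixel view of recon_and_store(): widen the destination bytes to
// 16 bits, add the already rounded/shifted residual, then clamp to
// [0, 255] as _mm_packus_epi16() does before storing.
static void recon_and_store_scalar(uint8_t *dest, const int16_t *res) {
  int i;
  for (i = 0; i < 8; i++) {
    const int v = dest[i] + res[i];
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}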
_mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. -#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0); \ - res1 = idct_calc_wraplow_sse2(lo_0, hi_0, cst1); \ - res2 = idct_calc_wraplow_sse2(lo_1, hi_1, cst2); \ - res3 = idct_calc_wraplow_sse2(lo_1, hi_1, cst3); \ - } - static INLINE void recon_and_store4x4_sse2(const __m128i *const in, uint8_t *const dest, const int stride) { diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c index 4d2d95787..8c2be2cb6 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/vpx_dsp/x86/inv_txfm_ssse3.c @@ -12,11 +12,11 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1 << 4); const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); @@ -28,36 +28,35 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in[8]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; // Load input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); + in[4] = load_input_data(input + 8 * 4); + in[5] = load_input_data(input + 8 * 5); + in[6] = load_input_data(input + 8 * 6); + in[7] = load_input_data(input + 8 * 7); // 2-D for (i = 0; i < 2; i++) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 { /* Stage1 */ { - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); + const __m128i lo_17 = _mm_unpacklo_epi16(in[1], in[7]); + const __m128i hi_17 = _mm_unpackhi_epi16(in[1], in[7]); + const __m128i lo_35 = _mm_unpacklo_epi16(in[3], in[5]); + const __m128i hi_35 = _mm_unpackhi_epi16(in[3], in[5]); { tmp0 = _mm_madd_epi16(lo_17, stg1_0); @@ -96,12 +95,12 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, /* Stage2 */ { - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); + const __m128i lo_26 = _mm_unpacklo_epi16(in[2], in[6]); + const __m128i hi_26 = _mm_unpackhi_epi16(in[2], in[6]); { - tmp0 = _mm_unpacklo_epi16(in0, in4); - tmp1 = _mm_unpackhi_epi16(in0, in4); + tmp0 = 
_mm_unpacklo_epi16(in[0], in[4]); + tmp1 = _mm_unpackhi_epi16(in[0], in[4]); tmp2 = _mm_madd_epi16(tmp0, stk2_0); tmp3 = _mm_madd_epi16(tmp1, stk2_0); @@ -176,44 +175,44 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, } /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); + in[0] = _mm_add_epi16(stp1_0, stp2_7); + in[1] = _mm_add_epi16(stp1_1, stp1_6); + in[2] = _mm_add_epi16(stp1_2, stp1_5); + in[3] = _mm_add_epi16(stp1_3, stp2_4); + in[4] = _mm_sub_epi16(stp1_3, stp2_4); + in[5] = _mm_sub_epi16(stp1_2, stp1_5); + in[6] = _mm_sub_epi16(stp1_1, stp1_6); + in[7] = _mm_sub_epi16(stp1_0, stp2_7); } } // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); } void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, @@ -232,82 +231,82 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in[8]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3; + __m128i tmp[4]; // Rows. Load 4-row input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + // 4x4 Transpose + transpose_16bit_4x4(in, in); // Stage1 - tmp0 = _mm_mulhrs_epi16(in0, stg1_0); - tmp1 = _mm_mulhrs_epi16(in0, stg1_1); - tmp2 = _mm_mulhrs_epi16(in1, stg1_2); - tmp3 = _mm_mulhrs_epi16(in1, stg1_3); + tmp[0] = _mm_mulhrs_epi16(in[0], stg1_0); + tmp[1] = _mm_mulhrs_epi16(in[0], stg1_1); + tmp[2] = _mm_mulhrs_epi16(in[1], stg1_2); + tmp[3] = _mm_mulhrs_epi16(in[1], stg1_3); - stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); - stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); + stp1_4 = _mm_unpackhi_epi64(tmp[0], tmp[1]); + stp1_5 = _mm_unpackhi_epi64(tmp[2], tmp[3]); // Stage2 - tmp0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); + tmp[0] = _mm_mulhrs_epi16(in[0], stg2_0); + stp2_0 = _mm_unpacklo_epi64(tmp[0], tmp[0]); - tmp1 = _mm_mulhrs_epi16(in1, stg2_2); - tmp2 = _mm_mulhrs_epi16(in1, stg2_3); - stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); + tmp[1] = _mm_mulhrs_epi16(in[1], stg2_2); + tmp[2] = _mm_mulhrs_epi16(in[1], stg2_3); + stp2_2 = _mm_unpacklo_epi64(tmp[2], tmp[1]); - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); + tmp[0] = _mm_add_epi16(stp1_4, stp1_5); + tmp[1] = _mm_sub_epi16(stp1_4, stp1_5); - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + stp2_4 = tmp[0]; + stp2_5 = _mm_unpacklo_epi64(tmp[1], zero); + stp2_6 = _mm_unpackhi_epi64(tmp[1], zero); - tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); - tmp1 = _mm_madd_epi16(tmp0, stg3_0); - tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 + tmp[0] = _mm_unpacklo_epi16(stp2_5, stp2_6); + tmp[1] = _mm_madd_epi16(tmp[0], stg3_0); + tmp[2] = _mm_madd_epi16(tmp[0], stk2_0); // stg3_1 = stk2_0 - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp[1] = _mm_add_epi32(tmp[1], rounding); + tmp[2] = _mm_add_epi32(tmp[2], rounding); + tmp[1] = _mm_srai_epi32(tmp[1], DCT_CONST_BITS); + tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp1, tmp2); + stp1_5 = _mm_packs_epi32(tmp[1], tmp[2]); // Stage3 - tmp2 = _mm_add_epi16(stp2_0, stp2_2); - tmp3 = _mm_sub_epi16(stp2_0, stp2_2); + tmp[2] = _mm_add_epi16(stp2_0, stp2_2); + tmp[3] = _mm_sub_epi16(stp2_0, stp2_2); - stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); - stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); + stp1_2 = _mm_unpackhi_epi64(tmp[3], tmp[2]); + stp1_3 = _mm_unpacklo_epi64(tmp[3], tmp[2]); // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); + tmp[0] = _mm_add_epi16(stp1_3, stp2_4); + tmp[1] = _mm_add_epi16(stp1_2, stp1_5); + tmp[2] = _mm_sub_epi16(stp1_3, stp2_4); + tmp[3] = _mm_sub_epi16(stp1_2, stp1_5); - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + idct8x8_12_transpose_16bit_4x8(tmp, in); /* Stage1 */ - stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); - stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); - stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); - stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); + stp1_4 = _mm_mulhrs_epi16(in[1], stg1_0); + stp1_7 = 
_mm_mulhrs_epi16(in[1], stg1_1); + stp1_5 = _mm_mulhrs_epi16(in[3], stg1_2); + stp1_6 = _mm_mulhrs_epi16(in[3], stg1_3); /* Stage2 */ - stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); + stp2_0 = _mm_mulhrs_epi16(in[0], stg2_0); + stp2_1 = _mm_mulhrs_epi16(in[0], stg2_0); - stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); - stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); + stp2_2 = _mm_mulhrs_epi16(in[2], stg2_2); + stp2_3 = _mm_mulhrs_epi16(in[2], stg2_3); stp2_4 = _mm_add_epi16(stp1_4, stp1_5); stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); @@ -320,62 +319,62 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); + tmp[0] = _mm_unpacklo_epi16(stp2_6, stp2_5); + tmp[1] = _mm_unpackhi_epi16(stp2_6, stp2_5); - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); + tmp[2] = _mm_madd_epi16(tmp[0], stk2_0); + tmp[3] = _mm_madd_epi16(tmp[1], stk2_0); + tmp[2] = _mm_add_epi32(tmp[2], rounding); + tmp[3] = _mm_add_epi32(tmp[3], rounding); + tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS); + tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS); + stp1_6 = _mm_packs_epi32(tmp[2], tmp[3]); - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp2, tmp3); + tmp[2] = _mm_madd_epi16(tmp[0], stk2_1); + tmp[3] = _mm_madd_epi16(tmp[1], stk2_1); + tmp[2] = _mm_add_epi32(tmp[2], rounding); + tmp[3] = _mm_add_epi32(tmp[3], rounding); + tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS); + tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS); + stp1_5 = _mm_packs_epi32(tmp[2], tmp[3]); /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); + in[0] = _mm_add_epi16(stp1_0, stp2_7); + in[1] = _mm_add_epi16(stp1_1, stp1_6); + in[2] = _mm_add_epi16(stp1_2, stp1_5); + in[3] = _mm_add_epi16(stp1_3, stp2_4); + in[4] = _mm_sub_epi16(stp1_3, stp2_4); + in[5] = _mm_sub_epi16(stp1_2, stp1_5); + in[6] = _mm_sub_epi16(stp1_1, stp1_6); + in[7] = _mm_sub_epi16(stp1_0, stp2_7); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], 
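The _mm_mulhrs_epi16() calls in this path depend on the constants being pre-doubled (2 * cospi_x_64): mulhrs computes (a * b + 2^14) >> 15 per lane, so a doubled constant reproduces the usual (x * cospi + 2^13) >> 14 rounding multiply without a separate add and shift. A scalar sketch of that identity (mulhrs_lane is a hypothetical name, for illustration only):

#include <stdint.h>

// _mm_mulhrs_epi16(x, 2*c) per 16-bit lane: (x * 2c + (1 << 14)) >> 15,
// which equals (x * c + (1 << 13)) >> 14, i.e. dct_const_round_shift(x * c).
static int16_t mulhrs_lane(int16_t x, int16_t c2 /* pre-doubled cospi */) {
  return (int16_t)(((int32_t)x * c2 + (1 << 14)) >> 15);
}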
final_rounding); - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); } // Only do addition and subtraction butterfly, size = 16, 32 @@ -618,7 +617,6 @@ static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 5); __m128i in[32], col[32]; __m128i stp1[32]; @@ -653,7 +651,7 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, // Final rounding and shift in[j] = _mm_adds_epi16(in[j], final_rounding); in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + recon_and_store(dest + j * stride, in[j]); } dest += 8; @@ -954,7 +952,6 @@ static void idct32_8x32_135(__m128i *in /*in[32]*/) { static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); int j = 0; while (j < 32) { in[j] = _mm_adds_epi16(in[j], final_rounding); @@ -963,16 +960,16 @@ static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { in[j] = _mm_srai_epi16(in[j], 6); in[j + 1] = _mm_srai_epi16(in[j + 1], 6); - RECON_AND_STORE(dst, in[j]); + recon_and_store(dst, in[j]); dst += stride; - RECON_AND_STORE(dst, in[j + 1]); + recon_and_store(dst, in[j + 1]); dst += stride; j += 2; } } -static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, - int stride) { +static INLINE void recon_and_store_ssse3(__m128i *in0, __m128i *in1, + uint8_t *dest, int stride) { store_buffer_8x32(in0, dest, stride); store_buffer_8x32(in1, dest + 8, stride); } @@ -1022,11 +1019,11 @@ void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, // rows transpose_and_copy_16x16(col0, col1, temp, left_16); idct32_135(col0, col1); - recon_and_store(col0, col1, dest, stride); + recon_and_store_ssse3(col0, col1, dest, stride); transpose_and_copy_16x16(col0, col1, temp, right_16); idct32_135(col0, col1); - recon_and_store(col0, col1, dest + 16, stride); + recon_and_store_ssse3(col0, col1, dest + 16, stride); } // For each 8x32 block __m128i in[32], diff --git a/vpx_dsp/x86/transpose_sse2.h b/vpx_dsp/x86/transpose_sse2.h index a5e40245a..bec59f5f9 
100644 --- a/vpx_dsp/x86/transpose_sse2.h +++ b/vpx_dsp/x86/transpose_sse2.h @@ -12,15 +12,91 @@ #define VPX_DSP_X86_TRANSPOSE_SSE2_H_ #include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" -static INLINE void transpose_16bit_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 02 12 22 32 03 13 23 33 + out[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + out[1] = _mm_unpackhi_epi32(tr0_0, tr0_1); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + // tr0_4: 04 14 05 15 06 16 07 17 + // tr0_5: 24 34 25 35 26 36 27 37 + // tr0_6: 44 54 45 55 46 56 47 57 + // tr0_7: 64 74 65 75 66 76 67 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 40 50 60 70 41 51 61 71 + // tr1_2: 04 14 24 34 05 15 25 35 + // tr1_3: 44 54 64 74 45 55 65 75 + // tr1_4: 02 12 22 32 03 13 23 33 + // tr1_5: 42 52 62 72 43 53 63 73 + // tr1_6: 06 16 26 36 07 17 27 37 + // tr1_7: 46 56 66 76 47 57 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_3 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_4 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_5 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + 
out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_4, tr1_5); + out[3] = _mm_unpackhi_epi64(tr1_4, tr1_5); + out[4] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[5] = _mm_unpackhi_epi64(tr1_2, tr1_3); + out[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + out[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1, From 9c72e85e4cfc87a4346701139bc25a56d43761c0 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Mon, 12 Jun 2017 15:45:50 -0700 Subject: [PATCH 2/4] Remove array_transpose_8x8() in x86 Duplicate of transpose_16bit_8x8() Change-Id: Iaa5dd63b5cccb044974a65af22c90e13418e311f --- vp9/encoder/x86/vp9_dct_intrin_sse2.c | 65 +++---------------------- vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 4 +- vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 2 +- vpx_dsp/x86/inv_txfm_sse2.c | 30 ++++++------ vpx_dsp/x86/inv_txfm_sse2.h | 39 ++------------- vpx_dsp/x86/inv_txfm_ssse3.c | 28 +++++------ 6 files changed, 44 insertions(+), 124 deletions(-) diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 09a1e48fc..beb2695ab 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" @@ -706,58 +707,6 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, store_output(&res[7], (output + 7 * stride)); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 
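Call sites of transpose_16bit_8x8() only need to know that the input may alias the output. A minimal usage sketch, assuming the transpose_sse2.h header introduced above (transpose_block_in_place is a hypothetical wrapper, for illustration only):

#include "vpx_dsp/x86/transpose_sse2.h"

// rows[i] holds the eight 16-bit coefficients of row i on entry; on
// return rows[i] holds column i. Passing the same array as input and
// output is safe because the helper reads all eight registers before
// writing any output.
static INLINE void transpose_block_in_place(__m128i rows[8]) {
  transpose_16bit_8x8(rows, rows);
}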
23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - static void fdct8_sse2(__m128i *in) { // constants const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); @@ -895,7 +844,7 @@ static void fdct8_sse2(__m128i *in) { in[7] = _mm_packs_epi32(v6, v7); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } static void fadst8_sse2(__m128i *in) { @@ -1125,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -1184,10 +1133,10 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); + transpose_16bit_8x8(res0, res0); + transpose_16bit_8x8(res1, tbuf); + transpose_16bit_8x8(res0 + 8, res1); + transpose_16bit_8x8(res1 + 8, res1 + 8); res0[8] = tbuf[0]; res0[9] = tbuf[1]; diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index a2412d124..c2b7262ad 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -182,8 +182,8 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, if (test) { // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); + transpose_16bit_8x8(inptr, inptr); + transpose_16bit_8x8(inptr + 8, inptr + 16); for (i = 0; i < 4; i++) { sign_bits = _mm_cmplt_epi16(inptr[i], zero); temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 29cc1d30e..e98d547ee 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -63,7 +63,7 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, test = _mm_movemask_epi8(temp1); if (test) { - array_transpose_8x8(inptr, inptr); + transpose_16bit_8x8(inptr, inptr); for (i = 0; i < 8; i++) { sign_bits = _mm_cmplt_epi16(inptr[i], zero); temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index b53505f7e..4ff77b381 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -325,7 +325,7 @@ void iadst8_sse2(__m128i *in) { __m128i in0, in1, in2, in3, in4, in5, in6, in7; // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); // properly aligned for butterfly input in0 = in[7]; @@ -787,8 +787,8 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, in[7] = load_input_data(input + 8 * 14); in[15] = load_input_data(input + 8 * 15); - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); + transpose_16bit_8x8(in, in); + transpose_16bit_8x8(in + 8, in + 8); IDCT16 @@ -816,8 +816,8 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, for (i = 0; i < 2; i++) { int j; // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); + transpose_16bit_8x8(l + i * 8, in); + transpose_16bit_8x8(r + i * 8, in + 8); IDCT16 @@ -2131,7 +2131,7 @@ void vpx_idct32x32_34_add_sse2(const 
tran_low_t *input, uint8_t *dest, in[6] = load_input_data(input + 192); in[7] = load_input_data(input + 224); - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); IDCT32_34 // 1_D: Store 32 intermediate results for each 8x32 block. @@ -2170,7 +2170,7 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, for (i = 0; i < 4; i++) { int j; // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); + transpose_16bit_8x8(col + i * 8, in); IDCT32_34 // 2_D: Calculate the results and store them to destination. @@ -2392,10 +2392,10 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, } // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); + transpose_16bit_8x8(in, in); + transpose_16bit_8x8(in + 8, in + 8); + transpose_16bit_8x8(in + 16, in + 16); + transpose_16bit_8x8(in + 24, in + 24); IDCT32 @@ -2438,10 +2438,10 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, j = i << 3; // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); + transpose_16bit_8x8(col + j, in); + transpose_16bit_8x8(col + j + 32, in + 8); + transpose_16bit_8x8(col + j + 64, in + 16); + transpose_16bit_8x8(col + j + 96, in + 24); IDCT32 diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index e15a97e1e..40fb9511c 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -15,38 +15,9 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. 
Goes from: @@ -104,10 +75,10 @@ static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); + transpose_16bit_8x8(res0, res0); + transpose_16bit_8x8(res1, tbuf); + transpose_16bit_8x8(res0 + 8, res1); + transpose_16bit_8x8(res1 + 8, res1 + 8); res0[8] = tbuf[0]; res0[9] = tbuf[1]; diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c index 8c2be2cb6..94504e478 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/vpx_dsp/x86/inv_txfm_ssse3.c @@ -632,7 +632,7 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, in[6] = load_input_data(input + 192); in[7] = load_input_data(input + 224); - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); idct32_34_first_half(in, stp1); idct32_34_second_half(in, stp1); @@ -641,7 +641,7 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, for (i = 0; i < 4; i++) { int j; // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); + transpose_16bit_8x8(col + i * 8, in); idct32_34_first_half(in, stp1); idct32_34_second_half(in, stp1); @@ -672,10 +672,10 @@ static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, __m128i *out1) { - array_transpose_8x8(in0, out0); - array_transpose_8x8(&in0[8], out1); - array_transpose_8x8(in1, &out0[8]); - array_transpose_8x8(&in1[8], &out1[8]); + transpose_16bit_8x8(in0, out0); + transpose_16bit_8x8(&in0[8], out1); + transpose_16bit_8x8(in1, &out0[8]); + transpose_16bit_8x8(&in1[8], &out1[8]); } // Group the coefficient calculation into smaller functions @@ -1306,10 +1306,10 @@ void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, input += 32 << 3; // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); + transpose_16bit_8x8(in, in); + transpose_16bit_8x8(in + 8, in + 8); + transpose_16bit_8x8(in + 16, in + 16); + transpose_16bit_8x8(in + 24, in + 24); idct32_full_8x32(in, col + (i << 5)); } @@ -1318,10 +1318,10 @@ void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, for (i = 0; i < 4; ++i) { j = i << 3; // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); + transpose_16bit_8x8(col + j, in); + transpose_16bit_8x8(col + j + 32, in + 8); + transpose_16bit_8x8(col + j + 64, in + 16); + transpose_16bit_8x8(col + j + 96, in + 24); idct32_full_8x32(in, in); store_buffer_8x32(in, dest, stride); From d6eeef9ee6324af69a9fb19b1c507c29700ac28f Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Mon, 12 Jun 2017 16:23:53 -0700 Subject: [PATCH 3/4] Clean array_transpose_{4X8,16x16,16x16_2) in x86 Change-Id: I341399ecbde37065375ea7e63511a26bfc285ea0 --- vp9/encoder/x86/vp9_dct_intrin_sse2.c | 21 +-------- vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 2 +- vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 2 +- vpx_dsp/x86/inv_txfm_sse2.c | 6 +-- vpx_dsp/x86/inv_txfm_sse2.h | 34 -------------- vpx_dsp/x86/inv_txfm_ssse3.c | 17 +++---- vpx_dsp/x86/transpose_sse2.h | 61 +++++++++++++++++++++++++ 7 files changed, 74 
insertions(+), 69 deletions(-) diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index beb2695ab..969c60aba 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -1131,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - transpose_16bit_8x8(res0, res0); - transpose_16bit_8x8(res1, tbuf); - transpose_16bit_8x8(res0 + 8, res1); - transpose_16bit_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -1951,13 +1934,13 @@ static void fadst16_8col(__m128i *in) { static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index c2b7262ad..1df91f08f 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -66,7 +66,7 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, test = _mm_movemask_epi8(temp1); if (test) { - array_transpose_16x16(inptr, inptr + 16); + transpose_16bit_16x16(inptr, inptr + 16); for (i = 0; i < 16; i++) { sign_bits = _mm_cmplt_epi16(inptr[i], zero); temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index e98d547ee..c12e3e1b9 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -165,7 +165,7 @@ void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, if (test) { // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); + transpose_16bit_4x8(inptr, inptr); for (i = 0; i < 4; i++) { sign_bits = _mm_cmplt_epi16(inptr[i], zero); temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 4ff77b381..32f1b63b8 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -1462,13 +1462,13 @@ static void idct16_8col(__m128i *in) { } void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); idct16_8col(in0); idct16_8col(in1); } void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); iadst16_8col(in0); iadst16_8col(in1); } @@ -1616,7 +1616,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, // Second 1-D inverse transform, performed per 8x16 block for (i = 0; i < 2; i++) { int j; - array_transpose_4X8(l + 8 * i, in); + transpose_16bit_4x8(l + 8 * i, in); IDCT16_10 diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 40fb9511c..bf86afd3c 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h 
@@ -56,40 +56,6 @@ static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
 }

-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  transpose_16bit_8x8(res0, res0);
-  transpose_16bit_8x8(res1, tbuf);
-  transpose_16bit_8x8(res0 + 8, res1);
-  transpose_16bit_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
 static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
   const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
   return _mm_srai_epi32(t, DCT_CONST_BITS);
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c
index 94504e478..3ea43769f 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -670,14 +670,6 @@ static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
   }
 }

-static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
-                                    __m128i *out1) {
-  transpose_16bit_8x8(in0, out0);
-  transpose_16bit_8x8(&in0[8], out1);
-  transpose_16bit_8x8(in1, &out0[8]);
-  transpose_16bit_8x8(&in1[8], &out1[8]);
-}
-
 // Group the coefficient calculation into smaller functions
 // to prevent stack spillover:
 // quarter_1: 0-7
@@ -986,7 +978,7 @@ static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
   switch (cols) {
     case left_16: {
       int i;
-      array_transpose_16x16(in0, in1);
+      transpose_16bit_16x16(in0, in1);
       for (i = 0; i < 16; ++i) {
         store[i] = in0[16 + i];
         store[16 + i] = in1[16 + i];
@@ -994,7 +986,10 @@ static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
       break;
     }
     case right_16: {
-      array_transpose_16x16_2(store, &store[16], in0, in1);
+      transpose_16bit_8x8(store, in0);
+      transpose_16bit_8x8(&store[8], in1);
+      transpose_16bit_8x8(&store[16], &in0[8]);
+      transpose_16bit_8x8(&store[24], &in1[8]);
       break;
     }
     default: { assert(0); }
@@ -1013,7 +1008,7 @@ void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
   load_buffer_16x16(input, col0, col1);

   // columns
-  array_transpose_16x16(col0, col1);
+  transpose_16bit_16x16(col0, col1);
   idct32_135(col0, col1);

   // rows
diff --git a/vpx_dsp/x86/transpose_sse2.h b/vpx_dsp/x86/transpose_sse2.h
index bec59f5f9..cac007474 100644
--- a/vpx_dsp/x86/transpose_sse2.h
+++ b/vpx_dsp/x86/transpose_sse2.h
@@ -33,6 +33,48 @@ static INLINE void transpose_16bit_4x4(const __m128i *const in,
   out[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);
 }

+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03 XX XX XX XX
+  // in[1]: 10 11 12 13 XX XX XX XX
+  // in[2]: 20 21 22 23 XX XX XX XX
+  // in[3]: 30 31 32 33 XX XX XX XX
+  // in[4]: 40 41 42 43 XX XX XX XX
+  // in[5]: 50 51 52 53 XX XX XX XX
+  // in[6]: 60 61 62 63 XX XX XX XX
+  // in[7]: 70 71 72 73 XX XX XX XX
+  // to:
+  // tr0_0: 00 10 01 11 02 12 03 13
+  // tr0_1: 20 30 21 31 22 32 23 33
+  // tr0_2: 40 50 41 51 42 52 43 53
+  // tr0_3: 60 70 61 71 62 72 63 73
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // tr1_0: 00 10 20 30 01 11 21 31
+  // tr1_1: 40 50 60 70 41 51 61 71
+  // tr1_2: 02 12 22 32 03 13 23 33
+  // tr1_3: 42 52 62 72 43 53 63 73
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30 40 50 60 70
+  // out[1]: 01 11 21 31 41 51 61 71
+  // out[2]: 02 12 22 32 42 52 62 72
+  // out[3]: 03 13 23 33 43 53 63 73
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+}
+
 static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                        __m128i *const out) {
   // Unpack 16 bit elements. Goes from:
@@ -99,6 +141,25 @@ static INLINE void transpose_16bit_8x8(const __m128i *const in,
   out[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }

+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+                                         __m128i *const right) {
+  __m128i tbuf[8];
+  transpose_16bit_8x8(left, left);
+  transpose_16bit_8x8(right, tbuf);
+  transpose_16bit_8x8(left + 8, right);
+  transpose_16bit_8x8(right + 8, right + 8);
+
+  left[8] = tbuf[0];
+  left[9] = tbuf[1];
+  left[10] = tbuf[2];
+  left[11] = tbuf[3];
+  left[12] = tbuf[4];
+  left[13] = tbuf[5];
+  left[14] = tbuf[6];
+  left[15] = tbuf[7];
+}
+
 static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
                                        __m128i *const a2, __m128i *const a3) {
   // Unpack 32 bit elements. Goes from:
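
The transpose_16bit_16x16() helper added above composes the full 16x16 transpose from four 8x8 block transposes: the two diagonal blocks are transposed in place, and the two off-diagonal blocks trade places through an eight-register temporary. Below is a small standalone scalar model of that blocked scheme; it is not part of the patch, and the array layout, helper names, and test values are illustrative only. left[] holds columns 0-7 and right[] holds columns 8-15 of each row, one row per array entry, mirroring the two __m128i arrays in the SIMD version.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Transpose one 8x8 block of 16-bit values; a local copy makes it safe to
// pass the same buffer as both input and output.
static void transpose_8x8_model(int16_t in[8][8], int16_t out[8][8]) {
  int16_t tmp[8][8];
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) tmp[c][r] = in[r][c];
  }
  memcpy(out, tmp, sizeof(tmp));
}

// Same block order as transpose_16bit_16x16(): diagonal blocks in place,
// off-diagonal blocks swapped through tbuf.
static void transpose_16x16_model(int16_t left[16][8], int16_t right[16][8]) {
  int16_t tbuf[8][8];
  transpose_8x8_model(left, left);            // top-left block, in place
  transpose_8x8_model(right, tbuf);           // top-right block -> temp
  transpose_8x8_model(left + 8, right);       // bottom-left -> top-right slot
  transpose_8x8_model(right + 8, right + 8);  // bottom-right block, in place
  memcpy(left + 8, tbuf, sizeof(tbuf));       // temp -> bottom-left slot
}

int main(void) {
  int16_t left[16][8], right[16][8];
  int r, c, ok = 1;
  // Fill element (r, c) with 16 * r + c, transpose, then expect 16 * c + r.
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int16_t v = (int16_t)(16 * r + c);
      if (c < 8) left[r][c] = v; else right[r][c - 8] = v;
    }
  }
  transpose_16x16_model(left, right);
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int16_t got = (c < 8) ? left[r][c] : right[r][c - 8];
      if (got != (int16_t)(16 * c + r)) ok = 0;
    }
  }
  printf(ok ? "16x16 transpose model OK\n" : "16x16 transpose model FAILED\n");
  return ok ? 0 : 1;
}

The ordering matters in the SIMD code for the same reason it does here: the bottom-left block is written into right[0..7] only after the original top-right block has been saved to tbuf, which is why the temporary cannot be dropped.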

From 6da6a2329132e7173fe00a76e8feb578d4031164 Mon Sep 17 00:00:00 2001
From: Linfeng Zhang
Date: Tue, 13 Jun 2017 16:53:53 -0700
Subject: [PATCH 4/4] Update high bitdepth load_input_data() in x86

BUG=webm:1412

Change-Id: Ibf9d120b80c7d3a7637e79e123cf2f0aae6dd78c
---
 vpx_dsp/x86/inv_txfm_sse2.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index bf86afd3c..a739fd1a4 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -80,8 +80,20 @@ static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
 // highbitdepth enabled
 static INLINE __m128i load_input_data(const tran_low_t *data) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
-                        data[6], data[7]);
+  // in0: 0 X 1 X 2 X 3 X
+  // in1: 4 X 5 X 6 X 7 X
+  // t0:  0 4 X X 1 5 X X
+  // t1:  2 6 X X 3 7 X X
+  // t2:  0 2 4 6 X X X X
+  // t3:  1 3 5 7 X X X X
+  // rtn: 0 1 2 3 4 5 6 7
+  const __m128i in0 = _mm_load_si128((const __m128i *)data);
+  const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
+  const __m128i t0 = _mm_unpacklo_epi16(in0, in1);
+  const __m128i t1 = _mm_unpackhi_epi16(in0, in1);
+  const __m128i t2 = _mm_unpacklo_epi16(t0, t1);
+  const __m128i t3 = _mm_unpackhi_epi16(t0, t1);
+  return _mm_unpacklo_epi16(t2, t3);
 #else
   return _mm_load_si128((const __m128i *)data);
 #endif
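
For reference, the unpack sequence the final hunk adds can be exercised on its own. The sketch below is not part of the patch; pack_8x32_to_8x16 and the sample coefficients are made up for illustration. It applies the same _mm_unpack*_epi16 steps to eight 32-bit values and checks the result against plain truncation to int16_t, which is what the trick relies on: inverse-transform input coefficients always fit in 16 bits, so the upper halves marked X in the comments can be discarded. It uses unaligned loads to stay self-contained, whereas load_input_data() expects 16-byte-aligned tran_low_t data.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

// Pack eight 32-bit coefficients into one vector of eight 16-bit values,
// using the same interleave sequence as the patched load_input_data().
static __m128i pack_8x32_to_8x16(const int32_t *data) {
  const __m128i in0 = _mm_loadu_si128((const __m128i *)data);        // 0 X 1 X 2 X 3 X
  const __m128i in1 = _mm_loadu_si128((const __m128i *)(data + 4));  // 4 X 5 X 6 X 7 X
  const __m128i t0 = _mm_unpacklo_epi16(in0, in1);                   // 0 4 X X 1 5 X X
  const __m128i t1 = _mm_unpackhi_epi16(in0, in1);                   // 2 6 X X 3 7 X X
  const __m128i t2 = _mm_unpacklo_epi16(t0, t1);                     // 0 2 4 6 X X X X
  const __m128i t3 = _mm_unpackhi_epi16(t0, t1);                     // 1 3 5 7 X X X X
  return _mm_unpacklo_epi16(t2, t3);                                 // 0 1 2 3 4 5 6 7
}

int main(void) {
  const int32_t coeffs[8] = { 64, -1024, 3, 0, -7, 32767, -32768, 12 };
  int16_t got[8];
  int i;
  _mm_storeu_si128((__m128i *)got, pack_8x32_to_8x16(coeffs));
  for (i = 0; i < 8; ++i) {
    if (got[i] != (int16_t)coeffs[i]) {
      printf("mismatch at %d: %d vs %d\n", i, got[i], (int)coeffs[i]);
      return 1;
    }
  }
  printf("pack_8x32_to_8x16 matches the scalar truncation\n");
  return 0;
}

The old path built the vector with octa_set_epi16() from eight scalar reads of data[0] through data[7]; the new one does the same job with two vector loads plus five unpacks.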