From 2b43a1ee1884f5de40dc5fab158d000da5b8654d Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Thu, 15 Jun 2017 16:06:53 -0700 Subject: [PATCH 1/3] Clean 32x32 full idct sse2 and ssse3 code vpx_idct32x32_1024_add_ssse3() is actually a sse2 function and faster than vpx_idct32x32_1024_add_sse2(). Replace the slow one. All are code relocations, no new code. Change-Id: I5dac0e98cc411a4ce05660406921118986638d19 --- test/dct_test.cc | 10 +- test/partial_idct_test.cc | 2 - vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/inv_txfm_sse2.c | 756 +++++++++++++---------------------- vpx_dsp/x86/inv_txfm_sse2.h | 72 ++++ vpx_dsp/x86/inv_txfm_ssse3.c | 375 ----------------- 6 files changed, 358 insertions(+), 859 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index 5b049c165..8ae6cbb6f 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -498,18 +498,16 @@ INSTANTIATE_TEST_CASE_P( // TODO(johannkoenig): high bit depth fdct8x8. INSTANTIATE_TEST_CASE_P( SSSE3, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, - &vpx_idct32x32_1024_add_ssse3, 32, 0, - VPX_BITS_8), + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0, VPX_BITS_8))); #else // vpx_fdct8x8_ssse3 is only available in 64 bit builds. 
INSTANTIATE_TEST_CASE_P( SSSE3, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, - &vpx_idct32x32_1024_add_ssse3, 32, 0, - VPX_BITS_8), + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2, 8, 0, VPX_BITS_8))); #endif // !ARCH_X86_64 diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 8b99766b5..eb55f0ae8 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -738,8 +738,6 @@ INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, #if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE const PartialInvTxfmParam ssse3_partial_idct_tests[] = { - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 1024, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 135, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 371c2455f..f587db8cf 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -592,7 +592,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; specialize qw/vpx_idct16x16_10_add neon sse2/; specialize qw/vpx_idct16x16_1_add neon sse2/; - specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1024_add neon sse2/; specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 9e77bdbdb..b33d1e427 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -1583,12 +1583,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, } } -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } - #define IDCT32_34 \ /* Stage1 */ \ multiplication_and_add_2(&in[1], &zero, &stg1_0, &stg1_1, 
&stp1_16, \ @@ -1787,225 +1781,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, stp1_30 = stp2_30; \ stp1_31 = stp2_31; -#define IDCT32 \ - /* Stage1 */ \ - multiplication_and_add(&in[1], &in[31], &in[17], &in[15], &stg1_0, &stg1_1, \ - &stg1_2, &stg1_3, &stp1_16, &stp1_31, &stp1_17, \ - &stp1_30); \ - multiplication_and_add(&in[9], &in[23], &in[25], &in[7], &stg1_4, &stg1_5, \ - &stg1_6, &stg1_7, &stp1_18, &stp1_29, &stp1_19, \ - &stp1_28); \ - multiplication_and_add(&in[5], &in[27], &in[21], &in[11], &stg1_8, &stg1_9, \ - &stg1_10, &stg1_11, &stp1_20, &stp1_27, &stp1_21, \ - &stp1_26); \ - multiplication_and_add(&in[13], &in[19], &in[29], &in[3], &stg1_12, \ - &stg1_13, &stg1_14, &stg1_15, &stp1_22, &stp1_25, \ - &stp1_23, &stp1_24); \ - \ - /* Stage2 */ \ - multiplication_and_add(&in[2], &in[30], &in[18], &in[14], &stg2_0, &stg2_1, \ - &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9, \ - &stp2_14); \ - multiplication_and_add(&in[10], &in[22], &in[26], &in[6], &stg2_4, &stg2_5, \ - &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11, \ - &stp2_12); \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ - \ - /* Stage3 */ \ - multiplication_and_add(&in[4], &in[28], &in[20], &in[12], &stg3_0, &stg3_1, \ - &stg3_2, &stg3_3, &stp1_4, &stp1_7, 
&stp1_5, \ - &stp1_6); \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - multiplication_and_add(&stp2_17, &stp2_30, &stp2_18, &stp2_29, &stg3_4, \ - &stg3_5, &stg3_6, &stg3_4, &stp1_17, &stp1_30, \ - &stp1_18, &stp1_29); \ - multiplication_and_add(&stp2_21, &stp2_26, &stp2_22, &stp2_25, &stg3_8, \ - &stg3_9, &stg3_10, &stg3_8, &stp1_21, &stp1_26, \ - &stp1_22, &stp1_25); \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - \ - /* Stage4 */ \ - multiplication_and_add(&in[0], &in[16], &in[8], &in[24], &stg4_0, &stg4_1, \ - &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2, \ - &stp2_3); \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \ - &stg4_5, &stg4_6, &stg4_4, &stp2_9, &stp2_14, \ - &stp2_10, &stp2_13); \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, 
stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - \ - /* Stage5 */ \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \ - &stp1_6); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - multiplication_and_add(&stp2_18, &stp2_29, &stp2_19, &stp2_28, &stg4_4, \ - &stg4_5, &stg4_4, &stg4_5, &stp1_18, &stp1_29, \ - &stp1_19, &stp1_28); \ - multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg4_6, \ - &stg4_4, &stg4_6, &stg4_4, &stp1_20, &stp1_27, \ - &stp1_21, &stp1_26); \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - \ - /* Stage6 */ \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \ - 
&stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \ - &stp2_11, &stp2_12); \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - \ - /* Stage7 */ \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg6_0, \ - &stg4_0, &stg6_0, &stg4_0, &stp1_20, &stp1_27, \ - &stp1_21, &stp1_26); \ - multiplication_and_add(&stp2_22, &stp2_25, &stp2_23, &stp2_24, &stg6_0, \ - &stg4_0, &stg6_0, &stg4_0, &stp1_22, &stp1_25, \ - &stp1_23, &stp1_24); \ - \ - stp1_28 = 
stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; - // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { @@ -2152,178 +1927,284 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, } } +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i in[32] +static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, + __m128i *out /*out[8]*/) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ + __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ + + { + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); + butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); + } + + v4 = _mm_add_epi16(u4, u5); + v5 = _mm_sub_epi16(u4, u5); + v6 = _mm_sub_epi16(u7, u6); + v7 = _mm_add_epi16(u7, u6); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); + + butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); + butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); + } + + v0 = _mm_add_epi16(u0, u3); + v1 = _mm_add_epi16(u1, u2); + v2 = _mm_sub_epi16(u1, u2); + v3 = _mm_sub_epi16(u0, u3); + + out[0] = _mm_add_epi16(v0, v7); + out[1] = _mm_add_epi16(v1, v6); + out[2] = _mm_add_epi16(v2, v5); + out[3] = _mm_add_epi16(v3, v4); + out[4] = _mm_sub_epi16(v3, v4); + out[5] = _mm_sub_epi16(v2, v5); + out[6] = _mm_sub_epi16(v1, v6); + out[7] = _mm_sub_epi16(v0, v7); 
+} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i in[32] +static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ + __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ + + { + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); + butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); + } + + v8 = _mm_add_epi16(u8, u9); + v9 = _mm_sub_epi16(u8, u9); + v14 = _mm_sub_epi16(u15, u14); + v15 = _mm_add_epi16(u15, u14); + + { + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); + butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); + } + + v10 = _mm_sub_epi16(u11, u10); + v11 = _mm_add_epi16(u11, u10); + v12 = _mm_add_epi16(u12, u13); + v13 = _mm_sub_epi16(u12, u13); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(v8, v11); + out[1] = _mm_add_epi16(v9, v10); + out[6] = _mm_add_epi16(v14, v13); + out[7] = _mm_add_epi16(v15, v12); + + out[2] = _mm_sub_epi16(v9, v10); + out[3] = _mm_sub_epi16(v8, v11); + out[4] = _mm_sub_epi16(v15, v12); + out[5] = _mm_sub_epi16(v14, 
v13); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); + } +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i in[32] +// We avoid hiding an offset of 16 inside this function, so we output 0-15 into +// array out[16] +static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i v16, v17, v18, v19, v20, v21, v22, v23; + __m128i v24, v25, v26, v27, v28, v29, v30, v31; + __m128i u16, u17, u18, u19, u20, u21, u22, u23; + __m128i u24, u25, u26, u27, u28, u29, u30, u31; + + { + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); + 
butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); + butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); + butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); + + butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); + butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); + + butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); + butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); + } + + v16 = _mm_add_epi16(u16, u17); + v17 = _mm_sub_epi16(u16, u17); + v18 = _mm_sub_epi16(u19, u18); + v19 = _mm_add_epi16(u19, u18); + + v20 = _mm_add_epi16(u20, u21); + v21 = _mm_sub_epi16(u20, u21); + v22 = _mm_sub_epi16(u23, u22); + v23 = _mm_add_epi16(u23, u22); + + v24 = _mm_add_epi16(u24, u25); + v25 = _mm_sub_epi16(u24, u25); + v26 = _mm_sub_epi16(u27, u26); + v27 = _mm_add_epi16(u27, u26); + + v28 = _mm_add_epi16(u28, u29); + v29 = _mm_sub_epi16(u28, u29); + v30 = _mm_sub_epi16(u31, u30); + v31 = _mm_add_epi16(u31, u30); + + { + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); + } + + u16 = _mm_add_epi16(v16, v19); + u17 = _mm_add_epi16(v17, v18); + u18 = _mm_sub_epi16(v17, v18); + u19 = _mm_sub_epi16(v16, v19); + u20 = _mm_sub_epi16(v23, v20); + u21 = _mm_sub_epi16(v22, v21); + u22 = _mm_add_epi16(v22, v21); + u23 = _mm_add_epi16(v23, v20); + + u24 = _mm_add_epi16(v24, v27); + u25 = _mm_add_epi16(v25, v26); + u26 = _mm_sub_epi16(v25, v26); + u27 = _mm_sub_epi16(v24, v27); + + u28 = 
_mm_sub_epi16(v31, v28); + u29 = _mm_sub_epi16(v30, v29); + u30 = _mm_add_epi16(v29, v30); + u31 = _mm_add_epi16(v28, v31); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(u16, u23); + out[1] = _mm_add_epi16(u17, u22); + out[2] = _mm_add_epi16(u18, u21); + out[3] = _mm_add_epi16(u19, u20); + out[4] = _mm_sub_epi16(u19, u20); + out[5] = _mm_sub_epi16(u18, u21); + out[6] = _mm_sub_epi16(u17, u22); + out[7] = _mm_sub_epi16(u16, u23); + + out[8] = _mm_sub_epi16(u31, u24); + out[9] = _mm_sub_epi16(u30, u25); + out[10] = _mm_sub_epi16(u29, u26); + out[11] = _mm_sub_epi16(u28, u27); + out[12] = _mm_add_epi16(u27, u28); + out[13] = _mm_add_epi16(u26, u29); + out[14] = _mm_add_epi16(u25, u30); + out[15] = _mm_add_epi16(u24, u31); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); + butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); + butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); + butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); + } +} + +static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[32]*/) { + __m128i temp[16]; + idct32_full_8x32_quarter_1(in, temp); + idct32_full_8x32_quarter_2(in, &temp[8]); + add_sub_butterfly(temp, out, 16); +} + +static void idct32_full_8x32(const __m128i *in /*in[32]*/, + __m128i *out /*out[32]*/) { + __m128i temp[32]; + idct32_full_8x32_quarter_1_2(in, temp); + idct32_full_8x32_quarter_3_4(in, &temp[16]); + add_sub_butterfly(temp, out, 32); +} + +static void 
load_buffer_8x32(const tran_low_t *input, __m128i *in) { + int i; + for (i = 0; i < 8; ++i) { + in[i] = load_input_data(input); + in[i + 8] = load_input_data(input + 8); + in[i + 16] = load_input_data(input + 16); + in[i + 24] = load_input_data(input + 24); + input += 32; + } +} + void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); + __m128i col[128], in[32]; + int i, j; - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = 
pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, 
stp2_29, stp2_30, stp2_31; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], 
zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - 
col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } + // rows + for (i = 0; i < 4; ++i) { + load_buffer_8x32(input, in); + input += 32 << 3; // Transpose 32x8 block to 8x32 block transpose_16bit_8x8(in, in); @@ -2331,95 +2212,20 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, transpose_16bit_8x8(in + 16, in + 16); transpose_16bit_8x8(in + 24, in + 24); - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - 
col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + idct32_full_8x32(in, col + (i << 5)); } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; + // columns + for (i = 0; i < 4; ++i) { + j = i << 3; // Transpose 32x8 block to 8x32 block transpose_16bit_8x8(col + j, in); transpose_16bit_8x8(col + j + 32, in + 8); transpose_16bit_8x8(col + j + 64, in + 16); transpose_16bit_8x8(col + j + 96, in + 24); - IDCT32 - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = 
_mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - recon_and_store(dest + j * stride, in[j]); - } - + idct32_full_8x32(in, in); + store_buffer_8x32(in, dest, stride); dest += 8; } } diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index acaf86178..cfe5f788e 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -258,6 +258,78 @@ static INLINE void recon_and_store4x4_sse2(const __m128i *const in, *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); } +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + recon_and_store(dst, in[j]); + dst += stride; + recon_and_store(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ + do { \ + tmp0 = _mm_madd_epi16(x0, co0); \ + tmp1 = _mm_madd_epi16(x1, co0); \ + tmp2 = _mm_madd_epi16(x0, co1); \ + tmp3 = _mm_madd_epi16(x1, co1); \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + } while (0) + +static INLINE void butterfly(const __m128i *x0, const 
__m128i *x1, + const __m128i *c0, const __m128i *c1, __m128i *y0, + __m128i *y1) { + __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm_unpacklo_epi16(*x0, *x1); + u1 = _mm_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *y0 = _mm_packs_epi32(tmp0, tmp1); + *y1 = _mm_packs_epi32(tmp2, tmp3); +} + +static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, + const __m128i *c1) { + __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm_unpacklo_epi16(*x0, *x1); + u1 = _mm_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *x0 = _mm_packs_epi32(tmp0, tmp1); + *x1 = _mm_packs_epi32(tmp2, tmp3); +} + void idct4_sse2(__m128i *in); void idct8_sse2(__m128i *in); void idct16_sse2(__m128i *in0, __m128i *in1); diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c index 0e86e43f1..1a9fe51d7 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/vpx_dsp/x86/inv_txfm_ssse3.c @@ -150,60 +150,6 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, write_buffer_8x8(in, dest, stride); } -// Only do addition and subtraction butterfly, size = 16, 32 -static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, - int size) { - int i = 0; - const int num = size >> 1; - const int bound = size - 1; - while (i < num) { - out[i] = _mm_add_epi16(in[i], in[bound - i]); - out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); - i++; - } -} - -#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ - do { \ - tmp0 = _mm_madd_epi16(x0, co0); \ - tmp1 = _mm_madd_epi16(x1, co0); \ - tmp2 = _mm_madd_epi16(x0, co1); \ - tmp3 = _mm_madd_epi16(x1, co1); \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, 
DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - } while (0) - -static INLINE void butterfly(const __m128i *x0, const __m128i *x1, - const __m128i *c0, const __m128i *c1, __m128i *y0, - __m128i *y1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *y0 = _mm_packs_epi32(tmp0, tmp1); - *y1 = _mm_packs_epi32(tmp2, tmp3); -} - -static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, - const __m128i *c1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *x0 = _mm_packs_epi32(tmp0, tmp1); - *x1 = _mm_packs_epi32(tmp2, tmp3); -} - static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); @@ -715,24 +661,6 @@ static void idct32_8x32_135(__m128i *in /*in[32]*/) { add_sub_butterfly(out, in, 32); } -static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - int j = 0; - while (j < 32) { - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); - - in[j] = _mm_srai_epi16(in[j], 6); - in[j + 1] = _mm_srai_epi16(in[j + 1], 6); - - recon_and_store(dst, in[j]); - dst += stride; - recon_and_store(dst, in[j + 1]); - dst += stride; - j += 2; - } -} - static INLINE void recon_and_store_ssse3(__m128i *in0, __m128i *in1, uint8_t *dest, int stride) { store_buffer_8x32(in0, dest, stride); @@ -793,306 +721,3 @@ void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, 
idct32_135(col0, col1); recon_and_store_ssse3(col0, col1, dest + 16, stride); } - -// For each 8x32 block __m128i in[32], -// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 -// output pixels: 8-15 in __m128i in[32] -static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ - __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ - - { - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); - butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); - butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); - } - - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[6] = _mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - out[2] = _mm_sub_epi16(v9, v10); - out[3] = 
_mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// For each 8x32 block __m128i in[32], -// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 -// output pixels: 0-7 in __m128i in[32] -static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ - __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ - - { - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); - butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - - butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); - butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = _mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = 
_mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); -} - -// For each 8x32 block __m128i in[32], -// Input with odd index, -// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -// output pixels: 16-23, 24-31 in __m128i in[32] -// We avoid hide an offset, 16, inside this function. So we output 0-15 into -// array out[16] -static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); - butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); - butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); - butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); - - butterfly(&in[5], &in[27], 
&stg1_8, &stg1_9, &u20, &u27); - butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); - - butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); - butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 
= pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - out[4] = _mm_sub_epi16(u19, u20); - out[5] = _mm_sub_epi16(u18, u21); - out[6] = _mm_sub_epi16(u17, u22); - out[7] = _mm_sub_epi16(u16, u23); - - out[8] = _mm_sub_epi16(u31, u24); - out[9] = _mm_sub_epi16(u30, u25); - out[10] = _mm_sub_epi16(u29, u26); - out[11] = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); - butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); - butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); - butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); - } -} - -static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[16]; - idct32_full_8x32_quarter_1(in, temp); - idct32_full_8x32_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -static void idct32_full_8x32(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[32]; - idct32_full_8x32_quarter_1_2(in, temp); - idct32_full_8x32_quarter_3_4(in, &temp[16]); - add_sub_butterfly(temp, out, 32); -} - -static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { - int i; - for (i = 0; i < 8; ++i) { - in[i] = load_input_data(input); - in[i + 8] = load_input_data(input + 8); - in[i + 16] = load_input_data(input + 16); - in[i + 24] = 
load_input_data(input + 24); - input += 32; - } -} - -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i col[128], in[32]; - int i, j; - - // rows - for (i = 0; i < 4; ++i) { - load_buffer_8x32(input, in); - input += 32 << 3; - - // Transpose 32x8 block to 8x32 block - transpose_16bit_8x8(in, in); - transpose_16bit_8x8(in + 8, in + 8); - transpose_16bit_8x8(in + 16, in + 16); - transpose_16bit_8x8(in + 24, in + 24); - - idct32_full_8x32(in, col + (i << 5)); - } - - // columns - for (i = 0; i < 4; ++i) { - j = i << 3; - // Transpose 32x8 block to 8x32 block - transpose_16bit_8x8(col + j, in); - transpose_16bit_8x8(col + j + 32, in + 8); - transpose_16bit_8x8(col + j + 64, in + 16); - transpose_16bit_8x8(col + j + 96, in + 24); - - idct32_full_8x32(in, in); - store_buffer_8x32(in, dest, stride); - dest += 8; - } -} From 42522ce0b7af7384a201f73b48258730fd232470 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Thu, 15 Jun 2017 16:48:40 -0700 Subject: [PATCH 2/3] Update vpx_idct{8x8,16x16,32x32}_1_add_sse2() Change-Id: I365f8e53d9ccd028cef0f561d4de9e5916278609 --- vpx_dsp/x86/inv_txfm_sse2.c | 89 ++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index b33d1e427..1a8a61f92 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -233,25 +233,40 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, write_buffer_8x8(in, dest, stride); } +static INLINE void recon_and_store_8_dual(uint8_t *const dest, + const __m128i in_x, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride)); + d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride)); + d0 = _mm_unpacklo_epi8(d0, zero); + d1 = _mm_unpacklo_epi8(d1, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, 
d1); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0)); +} + void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - int a; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + dc_value = _mm_set1_epi16(a1); - dc_value = _mm_set1_epi16(a); - - recon_and_store(dest + 0 * stride, dc_value); - recon_and_store(dest + 1 * stride, dc_value); - recon_and_store(dest + 2 * stride, dc_value); - recon_and_store(dest + 3 * stride, dc_value); - recon_and_store(dest + 4 * stride, dc_value); - recon_and_store(dest + 5 * stride, dc_value); - recon_and_store(dest + 6 * stride, dc_value); - recon_and_store(dest + 7 * stride, dc_value); + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); } void idct8_sse2(__m128i *in) { @@ -784,20 +799,32 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, } } +static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_load_si128((__m128i *)(dest)); + d1 = _mm_unpackhi_epi8(d0, zero); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_store_si128((__m128i *)(dest), d0); +} + void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - int a, i; + int i; + tran_high_t a1; + tran_low_t out = 
WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16(a1); for (i = 0; i < 16; ++i) { - recon_and_store(dest + 0, dc_value); - recon_and_store(dest + 8, dc_value); + recon_and_store_16(dest, dc_value); dest += stride; } } @@ -2233,18 +2260,16 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - int a, j; + int j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16(a1); for (j = 0; j < 32; ++j) { - recon_and_store(dest + 0 + j * stride, dc_value); - recon_and_store(dest + 8 + j * stride, dc_value); - recon_and_store(dest + 16 + j * stride, dc_value); - recon_and_store(dest + 24 + j * stride, dc_value); + recon_and_store_16(dest + j * stride + 0, dc_value); + recon_and_store_16(dest + j * stride + 16, dc_value); } } From 466b667ff34f516f122d1d7d78e203c5c89b7249 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Mon, 19 Jun 2017 18:10:38 -0700 Subject: [PATCH 3/3] Clean vpx_idct16x16_256_add_sse2() Remove macro IDCT16 which is redundant with idct16_8col(). 
Change-Id: I783c5f4fda038a22d5ee5c2b22e8c2cdfb38432c --- vpx_dsp/x86/inv_txfm_sse2.c | 492 +++++++++++------------------------- 1 file changed, 145 insertions(+), 347 deletions(-) diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 1a8a61f92..99f5570cb 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -553,77 +553,6 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, write_buffer_8x8(in, dest, stride); } -#define IDCT16 \ - /* Stage2 */ \ - multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &stg2_0, &stg2_1, \ - &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9, \ - &stp2_14); \ - \ - multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &stg2_4, &stg2_5, \ - &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11, \ - &stp2_12); \ - \ - /* Stage3 */ \ - multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &stg3_0, &stg3_1, \ - &stg3_2, &stg3_3, &stp1_4, &stp1_7, &stp1_5, \ - &stp1_6); \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - /* Stage4 */ \ - multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &stg4_0, &stg4_1, \ - &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2, \ - &stp2_3); \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \ - &stg4_5, &stg4_6, &stg4_7, &stp2_9, &stp2_14, \ - &stp2_10, &stp2_13); \ - \ - /* Stage5 */ \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, 
stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \ - &stp1_6); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - \ - /* Stage6 */ \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \ - &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \ - &stp2_11, &stp2_12); - #define IDCT16_10 \ /* Stage2 */ \ multiplication_and_add(&in[1], &zero, &zero, &in[3], &stg2_0, &stg2_1, \ @@ -677,122 +606,164 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \ &stp2_11, &stp2_12); +static INLINE void idct16_8col(__m128i *const in) { + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i s[16], t[16]; + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &k__cospi_p30_m02, + &k__cospi_p02_p30, 
&k__cospi_p14_m18, + &k__cospi_p18_p14, &s[8], &s[15], &s[9], &s[14]); + } + { + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &k__cospi_p22_m10, + &k__cospi_p10_p22, &k__cospi_p06_m26, + &k__cospi_p26_p06, &s[10], &s[13], &s[11], &s[12]); + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &k__cospi_p28_m04, + &k__cospi_p04_p28, &k__cospi_p12_m20, + &k__cospi_p20_p12, &t[4], &t[7], &t[5], &t[6]); + } + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + { + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &k__cospi_p16_p16, + &k__cospi_p16_m16, &k__cospi_p24_m08, + &k__cospi_p08_p24, &s[0], &s[1], &s[2], &s[3]); + } + s[5] = _mm_sub_epi16(t[4], t[5]); + t[4] = _mm_add_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + t[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + { + const __m128i k__cospi_m08_p24 = 
pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + multiplication_and_add(&t[9], &t[14], &t[10], &t[13], &k__cospi_m08_p24, + &k__cospi_p24_p08, &k__cospi_m24_m08, + &k__cospi_m08_p24, &s[9], &s[14], &s[10], &s[13]); + } + s[11] = t[11]; + s[12] = t[12]; + s[15] = t[15]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16, + &t[5], &t[6]); + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + multiplication_and_add(&t[10], &t[13], &t[11], &t[12], &k__cospi_m16_p16, + &k__cospi_p16_p16, &k__cospi_m16_p16, + &k__cospi_p16_p16, &s[10], &s[13], &s[11], &s[12]); + + // stage 7 + in[0] = _mm_add_epi16(s[0], t[15]); + in[1] = _mm_add_epi16(s[1], t[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], t[9]); + in[7] = _mm_add_epi16(s[7], t[8]); + in[8] = _mm_sub_epi16(s[7], t[8]); + in[9] = _mm_sub_epi16(s[6], t[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); 
+ in[14] = _mm_sub_epi16(s[1], t[14]); + in[15] = _mm_sub_epi16(s[0], t[15]); +} + +static INLINE void idct16_load8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 2); + in[2] = load_input_data(input + 8 * 4); + in[3] = load_input_data(input + 8 * 6); + in[4] = load_input_data(input + 8 * 8); + in[5] = load_input_data(input + 8 * 10); + in[6] = load_input_data(input + 8 * 12); + in[7] = load_input_data(input + 8 * 14); +} + void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - 
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i l[16], r[16], out[16], *in; int i; - curr1 = l; + in = l; for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - + idct16_load8x8(input, in); transpose_16bit_8x8(in, in); + idct16_load8x8(input + 8, in + 8); transpose_16bit_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = 
_mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; + idct16_8col(in); + in = r; input += 128; } + for (i = 0; i < 2; i++) { int j; - // 1-D idct - transpose_16bit_8x8(l + i * 8, in); - transpose_16bit_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + transpose_16bit_8x8(l + i * 8, out); + transpose_16bit_8x8(r + i * 8, out + 8); + idct16_8col(out); + // Final rounding and shift for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - recon_and_store(dest + j * stride, in[j]); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + out[j] = _mm_adds_epi16(out[j], final_rounding); + out[j] = _mm_srai_epi16(out[j], 6); + recon_and_store(dest + j * stride, out[j]); } dest += 8; @@ -1249,179 +1220,6 @@ static void iadst16_8col(__m128i *in) { in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i 
k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - s[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p30_m02); - s[15] = idct_calc_wraplow_sse2(u[0], u[1], 
k__cospi_p02_p30); - s[9] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p14_m18); - s[14] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p18_p14); - s[10] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p22_m10); - s[13] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p10_p22); - s[11] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p06_m26); - s[12] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p26_p06); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - t[4] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p28_m04); - t[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p04_p28); - t[5] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p12_m20); - t[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p20_p12); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - s[0] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16); - s[1] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); - s[2] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p24_m08); - s[3] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p08_p24); - s[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m08_p24); - s[14] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p24_p08); - s[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m24_m08); - 
s[13] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m08_p24); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16, - &t[5], &t[6]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - s[10] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16); - s[13] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16); - s[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16); - s[12] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = 
_mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); -} - void idct16_sse2(__m128i *in0, __m128i *in1) { transpose_16bit_16x16(in0, in1); idct16_8col(in0);