# Patch summary (free-text preamble; everything below the first "diff --git" header is unchanged).
#
# aom_dsp/aom_dsp_rtcd_defs.pl
#   Appends "$ssse3_x86_64" to the specialize list of aom_fdct8x8 (previously sse2 only).
#
# aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
#   Adds the macro STORE_OUTPUT (arguments: index, result register number).
#   * With CONFIG_AOM_HIGHBITDEPTH: m11 is zeroed and compared (pcmpgtw) against the result, giving an
#     all-ones word wherever the result word is negative; the result is copied to m12; punpcklwd/punpckhwd
#     interleave the low/high four words with that mask, i.e. each 16-bit value is sign-extended to 32 bits.
#     The two halves are stored at [outputq + 4*index] and [outputq + 4*index + 16].
#   * Otherwise: a single store of the result at [outputq + 2*index].
#   In fdct8x8 the eight direct stores of m0..m7 are replaced by STORE_OUTPUT with indices 0, 8, ..., 56,
#   and the "pxor m11, m11" in the function prologue is dropped because the macro zeroes m11 itself.
#   NOTE(review): in high-bitdepth builds the macro overwrites m12, which the prologue loads with
#   pw_11585x2; presumably that constant is no longer needed once the stores begin - confirm against the
#   full function body, which this patch does not show.
#
# test/fdct8x8_test.cc
#   Removes the !CONFIG_AOM_HIGHBITDEPTH condition from the #if guarding the SSSE3 FwdTrans8x8DCT
#   instantiation, leaving HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE.
#
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index d6fa90b82..b073b1bd6 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -708,7 +708,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_fdct4x4_1 sse2/; add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct8x8 sse2/; + specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64"; add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct8x8_1 sse2/; diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 6f3c47083..5b2aab215 100644 --- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -130,12 +130,30 @@ SECTION .text psraw m%2, 1 %endmacro +%macro STORE_OUTPUT 2 ; index, result +%if CONFIG_AOM_HIGHBITDEPTH + ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + ; _mm_store_si128((__m128i *)(dst_ptr), out0); + ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + pxor m11, m11 + pcmpgtw m11, m%2 + movdqa m12, m%2 + punpcklwd m%2, m11 + punpckhwd m12, m11 + mova [outputq + 4*%1 + 0], m%2 + mova [outputq + 4*%1 + 16], m12 +%else + mova [outputq + 2*%1], m%2 +%endif +%endmacro + INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride mova m8, [pd_8192] mova m12, [pw_11585x2] - pxor m11, m11 lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -173,14 +191,14 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride DIVIDE_ROUND_2X 4, 5, 9, 10 DIVIDE_ROUND_2X 6, 7, 9, 10 - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + STORE_OUTPUT 0, 0 + STORE_OUTPUT 8, 1 + STORE_OUTPUT 16, 2 + 
STORE_OUTPUT 24, 3 + STORE_OUTPUT 32, 4 + STORE_OUTPUT 40, 5 + STORE_OUTPUT 48, 6 + STORE_OUTPUT 56, 7 RET %endif diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 9f62ffe25..bbfb7f1a2 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -728,8 +728,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, AOM_BITS_12))); #endif // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_AOM_HIGHBITDEPTH && \ - !CONFIG_EMULATE_HARDWARE +#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, ::testing::Values(make_tuple(&aom_fdct8x8_ssse3, &aom_idct8x8_64_add_ssse3,