diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 962cddb51..7e43c696a 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -145,9 +145,6 @@ TEST_P(Hadamard8x8Test, VaryStride) {
 INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_c));
 
-// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
-// in place and turn on the unit test.
-#if !CONFIG_VP9_HIGHBITDEPTH
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_sse2));
@@ -163,6 +160,9 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_neon));
 #endif  // HAVE_NEON
 
+// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
+// in place and turn on the unit test.
+#if !CONFIG_VP9_HIGHBITDEPTH
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_msa));
@@ -212,7 +212,6 @@ TEST_P(Hadamard16x16Test, VaryStride) {
   }
 }
 
-#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_c));
 
@@ -226,6 +225,7 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_neon));
 #endif  // HAVE_NEON
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_msa));
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
index 977323497..ebeafed31 100644
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -11,6 +11,8 @@
 #include <arm_neon.h>
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 
 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
@@ -45,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
 }
 
 void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
-                           int16_t *coeff) {
+                           tran_low_t *coeff) {
   int16x8_t a0 = vld1q_s16(src_diff);
   int16x8_t a1 = vld1q_s16(src_diff + src_stride);
   int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
@@ -63,18 +65,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
 
   // Skip the second transpose because it is not required.
 
-  vst1q_s16(coeff + 0, a0);
-  vst1q_s16(coeff + 8, a1);
-  vst1q_s16(coeff + 16, a2);
-  vst1q_s16(coeff + 24, a3);
-  vst1q_s16(coeff + 32, a4);
-  vst1q_s16(coeff + 40, a5);
-  vst1q_s16(coeff + 48, a6);
-  vst1q_s16(coeff + 56, a7);
+  store_s16q_to_tran_low(coeff + 0, a0);
+  store_s16q_to_tran_low(coeff + 8, a1);
+  store_s16q_to_tran_low(coeff + 16, a2);
+  store_s16q_to_tran_low(coeff + 24, a3);
+  store_s16q_to_tran_low(coeff + 32, a4);
+  store_s16q_to_tran_low(coeff + 40, a5);
+  store_s16q_to_tran_low(coeff + 48, a6);
+  store_s16q_to_tran_low(coeff + 56, a7);
 }
 
 void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
-                             int16_t *coeff) {
+                             tran_low_t *coeff) {
   int i;
 
   /* Rearrange 16x16 to 8x32 and remove stride.
@@ -88,10 +90,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
   vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
 
   for (i = 0; i < 64; i += 8) {
-    const int16x8_t a0 = vld1q_s16(coeff + 0);
-    const int16x8_t a1 = vld1q_s16(coeff + 64);
-    const int16x8_t a2 = vld1q_s16(coeff + 128);
-    const int16x8_t a3 = vld1q_s16(coeff + 192);
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);
 
     const int16x8_t b0 = vhaddq_s16(a0, a1);
     const int16x8_t b1 = vhsubq_s16(a0, a1);
@@ -103,10 +105,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
     const int16x8_t c2 = vsubq_s16(b0, b2);
     const int16x8_t c3 = vsubq_s16(b1, b3);
 
-    vst1q_s16(coeff + 0, c0);
-    vst1q_s16(coeff + 64, c1);
-    vst1q_s16(coeff + 128, c2);
-    vst1q_s16(coeff + 192, c3);
+    store_s16q_to_tran_low(coeff + 0, c0);
+    store_s16q_to_tran_low(coeff + 64, c1);
+    store_s16q_to_tran_low(coeff + 128, c2);
+    store_s16q_to_tran_low(coeff + 192, c3);
 
     coeff += 8;
   }
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
index d9b85223c..2f30a5add 100644
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -76,6 +76,17 @@ static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
 #endif
 }
 
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+  vst1q_s32(buf, v0);
+  vst1q_s32(buf + 4, v1);
+#else
+  vst1q_s16(buf, a);
+#endif
+}
+
 //------------------------------------------------------------------------------
 // Multiply a by a_const. Saturate, shift and narrow by 14.
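
[Note, not part of the patch] store_s16q_to_tran_low() complements the existing load_tran_low_to_s16q() helper: when CONFIG_VP9_HIGHBITDEPTH is enabled, tran_low_t is 32 bits wide, so each int16x8_t of Hadamard output is widened with vmovl_s16 before the store; otherwise it is stored as int16 directly. A minimal scalar sketch of the high-bit-depth path, assuming a 32-bit tran_low_t (the function name below is hypothetical, for illustration only):

    #include <stdint.h>

    typedef int32_t tran_low_t; /* assumed 32-bit in high-bit-depth builds */

    /* Sign-extend eight 16-bit coefficients into 32-bit tran_low_t slots,
     * the scalar equivalent of vmovl_s16 + vst1q_s32. */
    static void store_s16x8_to_tran_low_scalar(tran_low_t *buf,
                                               const int16_t a[8]) {
      int i;
      for (i = 0; i < 8; ++i) buf[i] = (tran_low_t)a[i];
    }
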
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 3cb2011b8..d23a5e7bf 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -888,10 +888,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
-  specialize qw/vpx_hadamard_8x8/;
+  specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
-  specialize qw/vpx_hadamard_16x16/;
+  specialize qw/vpx_hadamard_16x16 sse2 neon/;
 
   add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
   specialize qw/vpx_satd/;
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index b0a104bad..955d9ceab 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -11,6 +11,8 @@
 #include <emmintrin.h>  // SSE2
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/fdct.h"
 #include "vpx_ports/mem.h"
 
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
@@ -213,7 +215,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
 }
 
 void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
+                           tran_low_t *coeff) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -227,25 +229,25 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
   hadamard_col8_sse2(src, 0);
   hadamard_col8_sse2(src, 1);
 
-  _mm_store_si128((__m128i *)coeff, src[0]);
+  store_tran_low(src[0], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
+  store_tran_low(src[1], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
+  store_tran_low(src[2], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
+  store_tran_low(src[3], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
+  store_tran_low(src[4], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
+  store_tran_low(src[5], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
+  store_tran_low(src[6], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
+  store_tran_low(src[7], coeff);
 }
 
 void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
+                             tran_low_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     int16_t const *src_ptr =
@@ -254,10 +256,10 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
   }
 
   for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+    __m128i coeff0 = load_tran_low(coeff);
+    __m128i coeff1 = load_tran_low(coeff + 64);
+    __m128i coeff2 = load_tran_low(coeff + 128);
+    __m128i coeff3 = load_tran_low(coeff + 192);
 
     __m128i b0 = _mm_add_epi16(coeff0, coeff1);
     __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
@@ -271,13 +273,13 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
 
     coeff0 = _mm_add_epi16(b0, b2);
     coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+    store_tran_low(coeff0, coeff);
+    store_tran_low(coeff1, coeff + 64);
 
     coeff2 = _mm_sub_epi16(b0, b2);
     coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+    store_tran_low(coeff2, coeff + 128);
+    store_tran_low(coeff3, coeff + 192);
 
     coeff += 8;
   }
diff --git a/vpx_dsp/x86/avg_ssse3_x86_64.asm b/vpx_dsp/x86/avg_ssse3_x86_64.asm
index 36d38da3c..d170a4453 100644
--- a/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -8,8 +8,6 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-%define private_prefix vpx
-
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION .text
@@ -96,6 +94,21 @@ SECTION .text
   SWAP 7, 9
 %endmacro
 
+%if CONFIG_VP9_HIGHBITDEPTH
+; store %1 to outputq + %2
+; uses m8-m10 as scratch registers
+%macro STORE_TRAN_LOW 2
+  pxor      m8, m8
+  mova      m9, m%1
+  mova     m10, m%1
+  pcmpgtw   m8, m%1
+  punpcklwd m9, m8
+  punpckhwd m10, m8
+  mova [outputq + %2], m9
+  mova [outputq + %2 + 16], m10
+%endmacro
+%endif
+
 INIT_XMM ssse3
 cglobal hadamard_8x8, 3, 5, 11, input, stride, output
   lea r3, [2 * strideq]
@@ -117,6 +130,16 @@ cglobal hadamard_8x8, 3, 5, 11, input, stride, output
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
   HMD8_1D
 
+%if CONFIG_VP9_HIGHBITDEPTH
+  STORE_TRAN_LOW 0, 0
+  STORE_TRAN_LOW 1, 32
+  STORE_TRAN_LOW 2, 64
+  STORE_TRAN_LOW 3, 96
+  STORE_TRAN_LOW 4, 128
+  STORE_TRAN_LOW 5, 160
+  STORE_TRAN_LOW 6, 192
+  STORE_TRAN_LOW 7, 224
+%else
   mova [outputq + 0], m0
   mova [outputq + 16], m1
   mova [outputq + 32], m2
@@ -125,6 +148,7 @@ cglobal hadamard_8x8, 3, 5, 11, input, stride, output
   mova [outputq + 80], m5
   mova [outputq + 96], m6
   mova [outputq + 112], m7
+%endif
 
   RET
 %endif
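
[Note, not part of the patch] The STORE_TRAN_LOW macro performs the same widening in assembly: pcmpgtw against zero builds a per-word sign mask, and punpcklwd/punpckhwd interleave the coefficients with that mask, sign-extending eight 16-bit values into eight 32-bit tran_low_t slots. An intrinsics sketch of the same idea (illustrative only; the helper name is hypothetical and unaligned stores are used for simplicity):

    #include <emmintrin.h>
    #include <stdint.h>

    typedef int32_t tran_low_t; /* assumed 32-bit in high-bit-depth builds */

    static void store_tran_low_sketch(__m128i a, tran_low_t *buf) {
      const __m128i zero = _mm_setzero_si128();
      /* 0xFFFF in lanes where a is negative, 0x0000 elsewhere. */
      const __m128i sign = _mm_cmpgt_epi16(zero, a);
      /* Interleaving each word with its sign mask yields the sign-extended
       * 32-bit value: low half -> buf[0..3], high half -> buf[4..7]. */
      const __m128i lo = _mm_unpacklo_epi16(a, sign);
      const __m128i hi = _mm_unpackhi_epi16(a, sign);
      _mm_storeu_si128((__m128i *)buf, lo);
      _mm_storeu_si128((__m128i *)(buf + 4), hi);
    }
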