diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 98e18004b..c7d26203a 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -65,16 +65,11 @@ int16_t MaxSupportedCoeff(InvTxfmFunc a) {
 
 int16_t MinSupportedCoeff(InvTxfmFunc a) {
   (void)a;
-#if !CONFIG_EMULATE_HARDWARE
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
   if (a == vpx_idct8x8_64_add_ssse3 || a == vpx_idct8x8_12_add_ssse3) {
     return -23625 + 1;
   }
-#elif HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
-  if (a == vpx_idct4x4_16_add_neon) {
-    return std::numeric_limits<int16_t>::min() + 1;
-  }
-#endif
 #endif  // !CONFIG_EMULATE_HARDWARE
   return std::numeric_limits<int16_t>::min();
 }
diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm
index bd4e86ded..cb96d4f94 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -72,16 +72,15 @@
     ; do the transform on transposed rows
 
     ; stage 1
-    vadd.s16  d23, d16, d18   ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18   ; (input[0] - input[2])
-
     vmull.s16 q15, d17, d22   ; input[1] * cospi_24_64
     vmull.s16 q1,  d17, d20   ; input[1] * cospi_8_64
 
     ; (input[0] + input[2]) * cospi_16_64;
     ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
+    vmull.s16 q8,  d16, d21
+    vmull.s16 q14, d18, d21
+    vadd.s32  q13, q8, q14
+    vsub.s32  q14, q8, q14
 
     ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
     ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c
index 66ef26ad8..dd75dc08e 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -18,7 +18,7 @@
 
 static INLINE void idct4x4_16_kernel(const int16x4_t cospis, int16x8_t *a0,
                                      int16x8_t *a1) {
-  int16x4_t b0, b1, b2, b3, b4, b5;
+  int16x4_t b0, b1, b2, b3;
   int32x4_t c0, c1, c2, c3;
   int16x8_t d0, d1;
 
@@ -27,10 +27,10 @@ static INLINE void idct4x4_16_kernel(const int16x4_t cospis, int16x8_t *a0,
   b1 = vget_high_s16(*a0);
   b2 = vget_low_s16(*a1);
   b3 = vget_high_s16(*a1);
-  b4 = vadd_s16(b0, b1);
-  b5 = vsub_s16(b0, b1);
-  c0 = vmull_lane_s16(b4, cospis, 2);
-  c1 = vmull_lane_s16(b5, cospis, 2);
+  c0 = vmull_lane_s16(b0, cospis, 2);
+  c2 = vmull_lane_s16(b1, cospis, 2);
+  c1 = vsubq_s32(c0, c2);
+  c0 = vaddq_s32(c0, c2);
   c2 = vmull_lane_s16(b2, cospis, 3);
   c3 = vmull_lane_s16(b2, cospis, 1);
   c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
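
Reviewer note, not part of the patch: the old code added input[0] and input[2] in 16 bits before the widening multiply, so coefficient pairs near INT16_MIN wrapped in vadd.s16; the new code widens each term with vmull.s16 first and performs the add/sub in 32 bits, which is why the NEON special case in MinSupportedCoeff can be dropped. Below is a minimal scalar sketch of the two orderings; old_order/new_order are hypothetical names, cospi_16_64 = 11585 is the libvpx constant from vpx_dsp/txfm_common.h, and the dct_const_round_shift that follows in the real code is omitted for brevity.

#include <stdint.h>
#include <stdio.h>

#define COSPI_16_64 11585 /* round(cos(pi/4) * 2^14), as in vpx_dsp/txfm_common.h */

/* Old ordering: 16-bit add first, then widen and multiply. The cast to
 * int16_t wraps the same way NEON's modular vadd.s16 does. */
static int32_t old_order(int16_t in0, int16_t in2) {
  const int16_t sum = (int16_t)(in0 + in2); /* wraps for large-magnitude inputs */
  return (int32_t)sum * COSPI_16_64;
}

/* New ordering: widen each term via the multiply (vmull.s16), then add in
 * 32 bits (vadd.s32), so no intermediate exceeds 32-bit range. */
static int32_t new_order(int16_t in0, int16_t in2) {
  const int32_t p0 = (int32_t)in0 * COSPI_16_64; /* |p0| <= 379617280 */
  const int32_t p2 = (int32_t)in2 * COSPI_16_64;
  return p0 + p2; /* |p0 + p2| <= 759234560 < 2^31 */
}

int main(void) {
  /* INT16_MIN + INT16_MIN == -65536, which wraps to 0 in 16 bits. */
  printf("old: %d\n", old_order(INT16_MIN, INT16_MIN)); /* 0 */
  printf("new: %d\n", new_order(INT16_MIN, INT16_MIN)); /* -759234560 */
  return 0;
}

This is also why the test no longer needs to clamp the NEON minimum to min() + 1: with the 32-bit ordering, vpx_idct4x4_16_add_neon handles INT16_MIN directly.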