diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 98e18004b..c7d26203a 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -65,16 +65,11 @@ int16_t MaxSupportedCoeff(InvTxfmFunc a) {
 
 int16_t MinSupportedCoeff(InvTxfmFunc a) {
   (void)a;
-#if !CONFIG_EMULATE_HARDWARE
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
   if (a == vpx_idct8x8_64_add_ssse3 || a == vpx_idct8x8_12_add_ssse3) {
     return -23625 + 1;
   }
-#elif HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
-  if (a == vpx_idct4x4_16_add_neon) {
-    return std::numeric_limits<int16_t>::min() + 1;
-  }
-#endif
 #endif  // !CONFIG_EMULATE_HARDWARE
   return std::numeric_limits<int16_t>::min();
 }
diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm
index bd4e86ded..cb96d4f94 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -72,16 +72,15 @@
     ; do the transform on transposed rows
 
     ; stage 1
-    vadd.s16  d23, d16, d18   ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18   ; (input[0] - input[2])
-
     vmull.s16 q15, d17, d22   ; input[1] * cospi_24_64
     vmull.s16 q1,  d17, d20   ; input[1] * cospi_8_64
 
     ; (input[0] + input[2]) * cospi_16_64;
     ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
+    vmull.s16 q8,  d16, d21
+    vmull.s16 q14, d18, d21
+    vadd.s32  q13, q8, q14
+    vsub.s32  q14, q8, q14
 
     ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
     ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c
index 66ef26ad8..dd75dc08e 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -18,7 +18,7 @@
 
 static INLINE void idct4x4_16_kernel(const int16x4_t cospis, int16x8_t *a0,
                                      int16x8_t *a1) {
-  int16x4_t b0, b1, b2, b3, b4, b5;
+  int16x4_t b0, b1, b2, b3;
   int32x4_t c0, c1, c2, c3;
   int16x8_t d0, d1;
 
@@ -27,10 +27,10 @@ static INLINE void idct4x4_16_kernel(const int16x4_t cospis, int16x8_t *a0,
   b1 = vget_high_s16(*a0);
   b2 = vget_low_s16(*a1);
   b3 = vget_high_s16(*a1);
-  b4 = vadd_s16(b0, b1);
-  b5 = vsub_s16(b0, b1);
-  c0 = vmull_lane_s16(b4, cospis, 2);
-  c1 = vmull_lane_s16(b5, cospis, 2);
+  c0 = vmull_lane_s16(b0, cospis, 2);
+  c2 = vmull_lane_s16(b1, cospis, 2);
+  c1 = vsubq_s32(c0, c2);
+  c0 = vaddq_s32(c0, c2);
   c2 = vmull_lane_s16(b2, cospis, 3);
   c3 = vmull_lane_s16(b2, cospis, 1);
   c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
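
Reviewer note, not part of the patch: the old code added input[0] and input[2] in 16 bits before the widening multiply, so coefficient pairs near INT16_MIN wrapped in vadd.s16; the new code widens each term with vmull.s16 first and performs the add/sub in 32 bits, which is why the NEON special case in MinSupportedCoeff can be dropped. Below is a minimal scalar sketch of the two orderings; old_order/new_order are hypothetical names, cospi_16_64 = 11585 is the libvpx constant from vpx_dsp/txfm_common.h, and the dct_const_round_shift that follows in the real code is omitted for brevity.

#include <stdint.h>
#include <stdio.h>

#define COSPI_16_64 11585 /* round(cos(pi/4) * 2^14), as in vpx_dsp/txfm_common.h */

/* Old ordering: 16-bit add first, then widen and multiply. The cast to
 * int16_t wraps the same way NEON's modular vadd.s16 does. */
static int32_t old_order(int16_t in0, int16_t in2) {
  const int16_t sum = (int16_t)(in0 + in2); /* wraps for large-magnitude inputs */
  return (int32_t)sum * COSPI_16_64;
}

/* New ordering: widen each term via the multiply (vmull.s16), then add in
 * 32 bits (vadd.s32), so no intermediate exceeds 32-bit range. */
static int32_t new_order(int16_t in0, int16_t in2) {
  const int32_t p0 = (int32_t)in0 * COSPI_16_64; /* |p0| <= 379617280 */
  const int32_t p2 = (int32_t)in2 * COSPI_16_64;
  return p0 + p2; /* |p0 + p2| <= 759234560 < 2^31 */
}

int main(void) {
  /* INT16_MIN + INT16_MIN == -65536, which wraps to 0 in 16 bits. */
  printf("old: %d\n", old_order(INT16_MIN, INT16_MIN)); /* 0 */
  printf("new: %d\n", new_order(INT16_MIN, INT16_MIN)); /* -759234560 */
  return 0;
}

This is also why the test no longer needs to clamp the NEON minimum to min() + 1: with the 32-bit ordering, vpx_idct4x4_16_add_neon handles INT16_MIN directly.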