Refine 8-bit 16x16 idct NEON intrinsics

Speed test shows 25% gain on vpx_idct16x16_256_add_neon(), and vpx_idct16x16_10_add_neon() got trippled. Change-Id: If8518d9b6a3efab74031297b8d40cd83c4a49541
2017-01-06 17:52:07 -08:00 · 2017-01-06 17:52:07 -08:00 · 6abdd31555
commit 6abdd31555
parent c7e2bd6298
4 changed files with 853 additions and 1218 deletions
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@ -131,8 +131,6 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,

 void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
-  DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585,
-                                                             6270 };
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  int32x4_t c0 = vld1q_s32(input);
  int32x4_t c1 = vld1q_s32(input + 4);
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@ -17,17 +17,21 @@
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/vpx_dsp_common.h"

-DECLARE_ALIGNED(16, static const int16_t, kCospi[8]) = {
-  16384 /*  cospi_0_64 */,  15137 /* cospi_8_64 */,
-  11585 /*  cospi_16_64 */, 6270 /* cospi_24_64 */,
-  16069 /*  cospi_4_64 */,  13623 /* cospi_12_64 */,
-  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
+DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
+  16384 /*  cospi_0_64  */, 15137 /*  cospi_8_64  */,
+  11585 /*  cospi_16_64 */, 6270 /*  cospi_24_64 */,
+  16069 /*  cospi_4_64  */, 13623 /*  cospi_12_64 */,
+  -9102 /* -cospi_20_64 */, 3196 /*  cospi_28_64 */,
+  16305 /*  cospi_2_64  */, 1606 /*  cospi_30_64 */,
+  14449 /*  cospi_10_64 */, 7723 /*  cospi_22_64 */,
+  15679 /*  cospi_6_64  */, -4756 /* -cospi_26_64 */,
+  12665 /*  cospi_14_64 */, -10394 /* -cospi_18_64 */
 };

 DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = {
-  16384 /*  cospi_0_64 */,  15137 /* cospi_8_64 */,
+  16384 /*  cospi_0_64  */, 15137 /* cospi_8_64  */,
  11585 /*  cospi_16_64 */, 6270 /* cospi_24_64 */,
-  16069 /*  cospi_4_64 */,  13623 /* cospi_12_64 */,
+  16069 /*  cospi_4_64  */, 13623 /* cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
 };

@ -462,4 +466,79 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
  *io7 = vsubq_s16(step1[0], step2[7]);
 }

+static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
+                                              int16x8_t *const d0,
+                                              int16x8_t *const d1) {
+  int16x4_t t16[4];
+
+  t16[0] = vrshrn_n_s32(t32[0], 14);
+  t16[1] = vrshrn_n_s32(t32[1], 14);
+  t16[2] = vrshrn_n_s32(t32[2], 14);
+  t16[3] = vrshrn_n_s32(t32[3], 14);
+  *d0 = vcombine_s16(t16[0], t16[1]);
+  *d1 = vcombine_s16(t16[2], t16[3]);
+}
+
+static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
+                                            const int16x8_t s1,
+                                            const int16x4_t cospi_0_8_16_24,
+                                            int32x4_t *const t32) {
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
+  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
+  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
+  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
+  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
+}
+
+static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
+                                     const int16x4_t cospi_0_8_16_24,
+                                     int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+  idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x4_t cospi_0_8_16_24,
+                                         int16x8_t *const d0,
+                                         int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+  t32[2] = vnegq_s32(t32[2]);
+  t32[3] = vnegq_s32(t32[3]);
+  idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x4_t cospi_0_8_16_24,
+                                      int16x8_t *const d0,
+                                      int16x8_t *const d1) {
+  int32x4_t t32[6];
+
+  t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
+  t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
+  t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+  t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+  t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+  t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+  idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
+                                    const int stride) {
+  uint8x8_t d = vld1_u8(*dest);
+  uint16x8_t q;
+
+  res = vrshrq_n_s16(res, 6);
+  q = vaddw_u8(vreinterpretq_u16_s16(res), d);
+  d = vqmovun_s16(vreinterpretq_s16_u16(q));
+  vst1_u8(*dest, d);
+  *dest += stride;
+}
+
 #endif  // VPX_DSP_ARM_IDCT_NEON_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@ -227,6 +227,7 @@ DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_neon.c
 else
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
@ -236,7 +237,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c
 endif  # HAVE_NEON_ASM
 DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
-DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c