enable vpx_idct32x32_1024_add_neon in hbd builds

BUG=webm:1294 Change-Id: Ibdda54e6d1303b0f73bc7bc71417e4041d7618de
2016-12-06 20:52:34 -08:00
parent 5d4aa325a6
commit 86e340c76e
3 changed files with 98 additions and 46 deletions
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -12,6 +12,7 @@

 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"

@@ -150,52 +151,98 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
 }

+static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
+                               int16x8_t *s2, int16x8_t *s3, int16x8_t *s4,
+                               int16x8_t *s5, int16x8_t *s6, int16x8_t *s7) {
+  *s0 = vld1q_s16(in);
+  in += 32;
+  *s1 = vld1q_s16(in);
+  in += 32;
+  *s2 = vld1q_s16(in);
+  in += 32;
+  *s3 = vld1q_s16(in);
+  in += 32;
+  *s4 = vld1q_s16(in);
+  in += 32;
+  *s5 = vld1q_s16(in);
+  in += 32;
+  *s6 = vld1q_s16(in);
+  in += 32;
+  *s7 = vld1q_s16(in);
+}
+
+static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1,
+                                               int16x8_t a2, int16x8_t a3,
+                                               int16x8_t a4, int16x8_t a5,
+                                               int16x8_t a6, int16x8_t a7,
+                                               int16_t **out) {
+  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  vst1q_s16(*out, a0);
+  *out += 8;
+  vst1q_s16(*out, a1);
+  *out += 8;
+  vst1q_s16(*out, a2);
+  *out += 8;
+  vst1q_s16(*out, a3);
+  *out += 8;
+  vst1q_s16(*out, a4);
+  *out += 8;
+  vst1q_s16(*out, a5);
+  *out += 8;
+  vst1q_s16(*out, a6);
+  *out += 8;
+  vst1q_s16(*out, a7);
+  *out += 8;
+}
+
 static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
-  const int16_t *in;
  int i;
-  const int stride = 32;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  for (i = 0; i < 4; i++, input += 8) {
-    in = input;
-    q8s16 = vld1q_s16(in);
-    in += stride;
-    q9s16 = vld1q_s16(in);
-    in += stride;
-    q10s16 = vld1q_s16(in);
-    in += stride;
-    q11s16 = vld1q_s16(in);
-    in += stride;
-    q12s16 = vld1q_s16(in);
-    in += stride;
-    q13s16 = vld1q_s16(in);
-    in += stride;
-    q14s16 = vld1q_s16(in);
-    in += stride;
-    q15s16 = vld1q_s16(in);
-
-    transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
-                      &q14s16, &q15s16);
-
-    vst1q_s16(t_buf, q8s16);
-    t_buf += 8;
-    vst1q_s16(t_buf, q9s16);
-    t_buf += 8;
-    vst1q_s16(t_buf, q10s16);
-    t_buf += 8;
-    vst1q_s16(t_buf, q11s16);
-    t_buf += 8;
-    vst1q_s16(t_buf, q12s16);
-    t_buf += 8;
-    vst1q_s16(t_buf, q13s16);
-    t_buf += 8;
-    vst1q_s16(t_buf, q14s16);
-    t_buf += 8;
-    vst1q_s16(t_buf, q15s16);
-    t_buf += 8;
+    load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+    transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
  }
 }

+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void load_s16x8q_tran_low(const tran_low_t *in, int16x8_t *s0,
+                                        int16x8_t *s1, int16x8_t *s2,
+                                        int16x8_t *s3, int16x8_t *s4,
+                                        int16x8_t *s5, int16x8_t *s6,
+                                        int16x8_t *s7) {
+  *s0 = load_tran_low_to_s16q(in);
+  in += 32;
+  *s1 = load_tran_low_to_s16q(in);
+  in += 32;
+  *s2 = load_tran_low_to_s16q(in);
+  in += 32;
+  *s3 = load_tran_low_to_s16q(in);
+  in += 32;
+  *s4 = load_tran_low_to_s16q(in);
+  in += 32;
+  *s5 = load_tran_low_to_s16q(in);
+  in += 32;
+  *s6 = load_tran_low_to_s16q(in);
+  in += 32;
+  *s7 = load_tran_low_to_s16q(in);
+}
+
+static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
+                                                  int16_t *t_buf) {
+  int i;
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  for (i = 0; i < 4; i++, input += 8) {
+    load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+    transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf);
+  }
+}
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+#define idct32_transpose_pair_tran_low idct32_transpose_pair
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
                                             int16x8_t q3s16, int16x8_t q6s16,
                                             int16x8_t q7s16, int16x8_t q8s16,
@@ -383,16 +430,22 @@ void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
  int16_t trans_buf[32 * 8];
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
-  int16_t *out;
+  int16_t *in, *out;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
       idct32_pass_loop++,
-      input = pass1,  // the input of pass2 is the result of pass1
+      in = pass1,  // the input of pass2 is the result of pass1
       out = pass2) {
-    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
-      idct32_transpose_pair(input, trans_buf);
+    for (i = 0; i < 4; i++, out += 8) {  // idct32_bands_loop
+      if (idct32_pass_loop == 0) {
+        idct32_transpose_pair_tran_low(input, trans_buf);
+        input += 32 * 8;
+      } else {
+        idct32_transpose_pair(in, trans_buf);
+        in += 32 * 8;
+      }

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -201,8 +201,6 @@ endif  # ARCH_X86_64
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)

 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_NEON)  += arm/idct32x32_add_neon.c
-
 DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
 DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
@@ -240,6 +238,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c

 endif  # CONFIG_VP9

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -696,7 +696,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    specialize qw/vpx_idct16x16_1_add neon sse2/;

    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
+    specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64";

    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
    specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64";