VP9 common for ARMv8 by using NEON intrinsics (part 14)

Add vp9_idct16x16_add_neon.c, which implements:
- vp9_idct16x16_256_add_neon_pass1
- vp9_idct16x16_256_add_neon_pass2
- vp9_idct16x16_10_add_neon_pass1
- vp9_idct16x16_10_add_neon_pass2

Change-Id: I54d25b54a36f4371760f54e4036693aaea40a5de
Signed-off-by: James Yu <james.yu@linaro.org>
2014-02-08 01:52:15 +08:00 · 2014-02-08 01:52:15 +08:00 · 3cfed4bf76
commit 3cfed4bf76
parent ce76aeb00d
5 changed files with 1350 additions and 6 deletions
--- a/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
--- a/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
+++ b/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@ -30,18 +30,24 @@ void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
                                     uint8_t *dest,
                                     int dest_stride);

+#if HAVE_NEON_ASM
 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
 extern void vp9_push_neon(int64_t *store);
 extern void vp9_pop_neon(int64_t *store);
+#endif  // HAVE_NEON_ASM

 void vp9_idct16x16_256_add_neon(const int16_t *input,
                                uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
  int64_t store_reg[8];
+#endif
  int16_t pass1_output[16*16] = {0};
  int16_t row_idct_output[16*16] = {0};

+#if HAVE_NEON_ASM
  // save d8-d15 register values.
  vp9_push_neon(store_reg);
+#endif

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@ -103,20 +109,26 @@ void vp9_idct16x16_256_add_neon(const int16_t *input,
                                     dest+8,
                                     dest_stride);

+#if HAVE_NEON_ASM
  // restore d8-d15 register values.
  vp9_pop_neon(store_reg);
+#endif

  return;
 }

 void vp9_idct16x16_10_add_neon(const int16_t *input,
                               uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
  int64_t store_reg[8];
+#endif
  int16_t pass1_output[16*16] = {0};
  int16_t row_idct_output[16*16] = {0};

+#if HAVE_NEON_ASM
  // save d8-d15 register values.
  vp9_push_neon(store_reg);
+#endif

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@ -165,8 +177,10 @@ void vp9_idct16x16_10_add_neon(const int16_t *input,
                                     dest+8,
                                     dest_stride);

+#if HAVE_NEON_ASM
  // restore d8-d15 register values.
  vp9_pop_neon(store_reg);
+#endif

  return;
 }
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@ -440,12 +440,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;

    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
-    $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+    specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon dspr2/;

    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
-    $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+    specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon dspr2/;

    add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@ -131,10 +131,8 @@ ifeq ($(ARCH_X86_64), yes)
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
 endif

-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
@ -151,6 +149,8 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
@ -166,6 +166,8 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c