NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon()

Also expose the NEON intrinsics version. BUG=webm:1261, webm:1266. Change-Id: I8c4ae658467dcf66ebf7a75982b2ef712dbb4535
2016-08-12 18:14:21 -07:00 · 2016-08-12 18:14:21 -07:00 · f9efbad392
commit f9efbad392
parent 5d881770e5
5 changed files with 94 additions and 36 deletions
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@ -531,16 +531,12 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
        make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
        make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
-INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
-                        ::testing::Values(
-// Using #if inside the macro is unsupported on MSVS but the tests are not
-// currently built for MSVS with ARM and NEON.
-#if HAVE_NEON_ASM
-                            make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon,
                                 &vpx_lpf_horizontal_8_dual_c, 8),
                      make_tuple(&vpx_lpf_vertical_8_dual_neon,
                                 &vpx_lpf_vertical_8_dual_c, 8),
-#endif  // HAVE_NEON_ASM
                      make_tuple(&vpx_lpf_horizontal_4_dual_neon,
                                 &vpx_lpf_horizontal_4_dual_c, 8),
                      make_tuple(&vpx_lpf_vertical_4_dual_neon,
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@ -9,7 +9,9 @@
 ;

    EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_dual_neon|
    EXPORT  |vpx_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_dual_neon|
    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2
@ -64,6 +66,38 @@

    ENDP        ; |vpx_lpf_horizontal_8_neon|

+;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
+;                                    int p,
+;                                    const uint8_t *blimit0,
+;                                    const uint8_t *limit0,
+;                                    const uint8_t *thresh0,
+;                                    const uint8_t *blimit1,
+;                                    const uint8_t *limit1,
+;                                    const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int p, /* pitch */
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_horizontal_8_dual_neon| PROC
+    push        {r0-r1, lr}
+    ldr         lr, [sp, #12]
+    push        {lr}                       ; thresh0
+    bl          vpx_lpf_horizontal_8_neon
+
+    ldr         r2, [sp, #20]              ; blimit1
+    ldr         r3, [sp, #24]              ; limit1
+    ldr         lr, [sp, #28]
+    str         lr, [sp, #16]              ; thresh1
+    add         sp, #4
+    pop         {r0-r1, lr}
+    add         r0, #8                     ; s + 8
+    b           vpx_lpf_horizontal_8_neon
+    ENDP        ; |vpx_lpf_horizontal_8_dual_neon|
+
 ; void vpx_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
@ -139,6 +173,38 @@
    pop         {r4-r5, pc}
    ENDP        ; |vpx_lpf_vertical_8_neon|

+;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
+;                                  int pitch,
+;                                  const uint8_t *blimit0,
+;                                  const uint8_t *limit0,
+;                                  const uint8_t *thresh0,
+;                                  const uint8_t *blimit1,
+;                                  const uint8_t *limit1,
+;                                  const uint8_t *thresh1)
+; r0      uint8_t *s,
+; r1      int pitch
+; r2      const uint8_t *blimit0,
+; r3      const uint8_t *limit0,
+; sp      const uint8_t *thresh0,
+; sp + 4  const uint8_t *blimit1,
+; sp + 8  const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_vertical_8_dual_neon| PROC
+    push        {r0-r1, lr}
+    ldr         lr, [sp, #12]
+    push        {lr}                       ; thresh0
+    bl          vpx_lpf_vertical_8_neon
+
+    ldr         r2, [sp, #20]              ; blimit1
+    ldr         r3, [sp, #24]              ; limit1
+    ldr         lr, [sp, #28]
+    str         lr, [sp, #16]              ; thresh1
+    add         sp, #4
+    pop         {r0-r1, lr}
+    add         r0, r1, lsl #3             ; s + 8 * pitch
+    b           vpx_lpf_vertical_8_neon
+    ENDP        ; |vpx_lpf_vertical_8_dual_neon|
+
 ; void vpx_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@ -311,6 +311,14 @@ void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
  return;
 }

+void vpx_lpf_horizontal_8_dual_neon(
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+}
+
 void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  int i;
@ -427,3 +435,11 @@ void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
  }
  return;
 }
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@ -21,21 +21,3 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
 }
-
-#if HAVE_NEON_ASM
-void vpx_lpf_horizontal_8_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-#endif  // HAVE_NEON_ASM
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@ -514,8 +514,7 @@ add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *bl
 specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;

 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;

 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
@ -533,8 +532,7 @@ add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *
 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;

 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;

 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;