From 45a7b5ebd7fcf7b329710e3f347ce40bd2bf6a84 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 12 Feb 2016 19:32:05 -0800
Subject: [PATCH 01/16] lpf_8_test: simplify function wrapper generation

Change-Id: Ie4d3e80a4e43dd4ada78d073e308e10db4ea3239
---
 test/lpf_8_test.cc | 159 ++++++++++++---------------------------------
 1 file changed, 43 insertions(+), 116 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 0bf6b0c23..c582bc34d 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -42,6 +42,17 @@ typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1, int bd);
+
+// wrapper for loopfilter functions without a 'count' param.
+typedef void (*loop_op_nc_t)(uint16_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh,
+                             int bd);
+template <loop_op_nc_t fn>
+void wrapper_nc(uint16_t *s, int p, const uint8_t *blimit,
+                const uint8_t *limit, const uint8_t *thresh,
+                int /*count*/, int bd) {
+  fn(s, p, blimit, limit, thresh, bd);
+}
 #else
 typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
@@ -50,107 +61,21 @@ typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1);
+
+// wrapper for loopfilter functions without a 'count' param.
+typedef void (*loop_op_nc_t)(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh);
+template <loop_op_nc_t fn>
+void wrapper_nc(uint8_t *s, int p, const uint8_t *blimit,
+                const uint8_t *limit, const uint8_t *thresh,
+                int /*count*/) {
+  fn(s, p, blimit, limit, thresh);
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
 typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
 
-#if HAVE_SSE2
-#if CONFIG_VP9_HIGHBITDEPTH
-void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                              const uint8_t *limit, const uint8_t *thresh,
-                              int count, int bd) {
-  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
-}
-
-void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count, int bd) {
-  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
-}
-
-void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count, int bd) {
-  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
-}
-
-void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh,
-                                int count, int bd) {
-  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
-}
-#else
-void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
-                              const uint8_t *limit, const uint8_t *thresh,
-                              int count) {
-  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count) {
-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count) {
-  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh,
-                                int count) {
-  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_SSE2
-
-#if HAVE_NEON_ASM
-#if CONFIG_VP9_HIGHBITDEPTH
-// No neon high bitdepth functions.
-#else
-void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
-                              const uint8_t *limit, const uint8_t *thresh,
-                              int count) {
-  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count) {
-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count) {
-  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh,
-                                int count) {
-  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_NEON_ASM
-
-#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
-void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh,
-                             int count) {
-  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count) {
-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
-}
-#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
-
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
  public:
   virtual ~Loop8Test6Param() {}
@@ -546,8 +471,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_16_c, 8, 2),
         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
                    &vpx_highbd_lpf_vertical_8_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
                    &vpx_highbd_lpf_horizontal_4_c, 10, 1),
         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
@@ -560,8 +485,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_16_c, 10, 2),
         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
                    &vpx_highbd_lpf_vertical_8_c, 10, 1),
-        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 10, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
                    &vpx_highbd_lpf_horizontal_4_c, 12, 1),
         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
@@ -574,14 +499,14 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_16_c, 12, 2),
         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
                    &vpx_highbd_lpf_vertical_8_c, 12, 1),
-        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 12, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 10, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 12, 1)));
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 12, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_dual_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_16_dual_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_dual_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_16_dual_c>, 10, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_dual_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_16_dual_c>, 12, 1)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
@@ -590,9 +515,10 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 8, 1)));
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_sse2>,
+                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_sse2>,
+                   &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
@@ -663,10 +589,10 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_neon,
                    &vpx_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&wrapper_vertical_16_neon,
-                   &wrapper_vertical_16_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_dual_neon,
-                   &wrapper_vertical_16_dual_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_neon>,
+                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_neon>,
+                   &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1),
 #endif  // HAVE_NEON_ASM
         make_tuple(&vpx_lpf_horizontal_8_neon,
                    &vpx_lpf_horizontal_8_c, 8, 1),
@@ -700,7 +626,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_msa>,
+                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1)));
 
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test9Param,

From c3f2c8ad2a00453b63cf3fab89968ad10d9d616b Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 12 Feb 2016 20:23:41 -0800
Subject: [PATCH 02/16] lpf_8_test: add missing vpx_lpf_vertical_4 tests

Adds the missing tests for the mmx and msa implementations.

Change-Id: I113ce0ec144ee673d5dcde4c03fe7670f9f4c369
---
 test/lpf_8_test.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index c582bc34d..45cd6618c 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -454,6 +454,13 @@ TEST_P(Loop8Test9Param, ValueCheck) {
 
 using std::tr1::make_tuple;
 
+#if HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    MMX, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_vertical_4_mmx, &vpx_lpf_vertical_4_c, 8, 1)));
+#endif  // HAVE_MMX
+
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -625,6 +632,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_4_msa, &vpx_lpf_vertical_4_c, 8, 1),
         make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_msa>,
                    &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1)));

From 4fec4a8e288cebeb265996e54d2cb4cd000bb38b Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 12 Feb 2016 20:25:15 -0800
Subject: [PATCH 03/16] lpf_8_test: add missing vpx_lpf_horizontal_4 tests

Adds the missing tests for the mmx and msa implementations.

Change-Id: Ia9604adcdcc77411f383e081e01a18d232c9d992
---
 test/lpf_8_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 45cd6618c..58bcb05d1 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -458,6 +458,7 @@ using std::tr1::make_tuple;
 INSTANTIATE_TEST_CASE_P(
     MMX, Loop8Test6Param,
     ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_mmx, &vpx_lpf_horizontal_4_c, 8, 1),
         make_tuple(&vpx_lpf_vertical_4_mmx, &vpx_lpf_vertical_4_c, 8, 1)));
 #endif  // HAVE_MMX
 
@@ -629,6 +630,7 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),

From 47dee375db30e803a9ff8c0694040b1eb819add7 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Sat, 13 Feb 2016 10:24:26 -0800
Subject: [PATCH 04/16] lpf_8_test: add missing dspr2 tests

Change-Id: I3954ff86ec1965cd6d4eec570c2d1993538d9c11
---
 test/lpf_8_test.cc | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 58bcb05d1..3f73a996f 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -626,6 +626,36 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
+#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    DSPR2, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_8_dspr2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_dspr2,
+                   &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_dspr2,
+                   &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_4_dspr2, &vpx_lpf_vertical_4_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_8_dspr2, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dspr2>,
+                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_dspr2>,
+                   &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1)));
+
+INSTANTIATE_TEST_CASE_P(
+    DSPR2, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dual_dspr2,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_dspr2,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_dspr2,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_dspr2,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
+
 #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,

From 37225744dbf30d79711fa9ef182d2007a51b11bd Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 19:43:36 -0800
Subject: [PATCH 05/16] vpx_lpf_vertical_8: remove unused count param

Change-Id: Ic69406da00afb0f06588e8c0deb2b043952b078c
---
 test/lpf_8_test.cc                      | 13 ++++++++-----
 vp10/common/loopfilter.c                |  7 +++----
 vp9/common/vp9_loopfilter.c             |  7 +++----
 vpx_dsp/arm/loopfilter_8_neon.asm       | 14 +-------------
 vpx_dsp/arm/loopfilter_8_neon.c         |  8 ++------
 vpx_dsp/arm/loopfilter_neon.c           |  4 ++--
 vpx_dsp/loopfilter.c                    | 10 ++++------
 vpx_dsp/mips/loopfilter_8_msa.c         |  5 +----
 vpx_dsp/mips/loopfilter_filters_dspr2.c |  5 ++---
 vpx_dsp/mips/loopfilter_mb_dspr2.c      |  3 +--
 vpx_dsp/vpx_dsp_rtcd_defs.pl            |  2 +-
 vpx_dsp/x86/loopfilter_sse2.c           |  3 +--
 12 files changed, 29 insertions(+), 52 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 3f73a996f..9697c88b9 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -522,7 +522,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_sse2>,
+                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_sse2>,
                    &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_sse2>,
@@ -604,8 +605,8 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_NEON_ASM
         make_tuple(&vpx_lpf_horizontal_8_neon,
                    &vpx_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vpx_lpf_vertical_8_neon,
-                   &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_neon>,
+                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_4_neon,
                    &vpx_lpf_horizontal_4_c, 8, 1),
         make_tuple(&vpx_lpf_vertical_4_neon,
@@ -637,7 +638,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_16_dspr2,
                    &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&vpx_lpf_vertical_4_dspr2, &vpx_lpf_vertical_4_c, 8, 1),
-        make_tuple(&vpx_lpf_vertical_8_dspr2, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_dspr2>,
+                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dspr2>,
                    &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_dspr2>,
@@ -665,7 +667,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&vpx_lpf_vertical_4_msa, &vpx_lpf_vertical_4_c, 8, 1),
-        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_msa>,
+                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_msa>,
                    &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1)));
 
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index a1925de55..4171c1e08 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -345,11 +345,10 @@ static void filter_selectively_vert_row2(int subsampling_factor,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                             1);
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
           vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+                             lfi1->hev_thr);
         }
       }
 
@@ -1127,7 +1126,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
       if (mask_16x16 & 1) {
         vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_8x8 & 1) {
-        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_4x4 & 1) {
         vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       }
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 79c3c4820..8c281c2ec 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -345,11 +345,10 @@ static void filter_selectively_vert_row2(int subsampling_factor,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                             1);
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
           vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+                             lfi1->hev_thr);
         }
       }
 
@@ -1102,7 +1101,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
       if (mask_16x16 & 1) {
         vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_8x8 & 1) {
-        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_4x4 & 1) {
         vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       }
diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
index e81734c04..61cabe8e8 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -82,30 +82,24 @@ end_vpx_mblf_h_edge
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
-;                              const uint8_t *thresh,
-;                              int count)
+;                              const uint8_t *thresh)
 ;
 ; r0    uint8_t *s,
 ; r1    int pitch,
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_vertical_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #16]            ; load count
     vld1.8      {d1[]}, [r3]              ; duplicate *limit
 
     ldr         r3, [sp, #12]             ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vpx_mblf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
-count_mblf_v_loop
     vld1.u8     {d3}, [r2], r1             ; load s data
     vld1.u8     {d4}, [r2], r1
     vld1.u8     {d5}, [r2], r1
@@ -156,12 +150,6 @@ count_mblf_v_loop
     vst2.8      {d4[6], d5[6]}, [r3], r1
     vst2.8      {d4[7], d5[7]}, [r3]
 
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_mblf_v_loop
-
-end_vpx_mblf_v_edge
     pop         {r4-r5, pc}
     ENDP        ; |vpx_lpf_vertical_8_neon|
 
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
index a887e2ee5..3c005700f 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -328,8 +328,7 @@ void vpx_lpf_vertical_8_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i;
     uint8_t *s;
     uint8x8_t dblimit, dlimit, dthresh;
@@ -341,14 +340,11 @@ void vpx_lpf_vertical_8_neon(
     uint8x8x4_t d4Result;
     uint8x8x2_t d2Result;
 
-    if (count == 0)
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < 1; i++) {
         s = src + (i * (pitch << 3)) - 4;
 
         d3u8 = vld1_u8(s);
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index eff87d29b..581410541 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -44,8 +44,8 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 66f4d9576..fdb5dbbab 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -218,11 +218,10 @@ void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
 }
 
 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count) {
+                          const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask = filter_mask(*limit, *blimit,
@@ -238,9 +237,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
-                                    thresh1, 1);
+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
 static INLINE void filter16(int8_t mask, uint8_t thresh,
diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c
index 00b6db550..ec3f5dd22 100644
--- a/vpx_dsp/mips/loopfilter_8_msa.c
+++ b/vpx_dsp/mips/loopfilter_8_msa.c
@@ -161,8 +161,7 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
 void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
-                            const uint8_t *thresh_ptr,
-                            int32_t count) {
+                            const uint8_t *thresh_ptr) {
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   v16u8 p1_out, p0_out, q0_out, q1_out;
   v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -171,8 +170,6 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
   v16u8 zero = { 0 };
   v8i16 vec0, vec1, vec2, vec3, vec4;
 
-  (void)count;
-
   /* load vector elements */
   LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
index 99a96d89b..529df4ee6 100644
--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -346,9 +346,8 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
-                                       1);
+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c
index 4138f5697..5bbf091c8 100644
--- a/vpx_dsp/mips/loopfilter_mb_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -322,8 +322,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
-                              const uint8_t *thresh,
-                              int count) {
+                              const uint8_t *thresh) {
   uint8_t   i;
   uint32_t  mask, hev, flat;
   uint8_t   *s1, *s2, *s3, *s4;
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 73726d217..feaf0ae7e 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -535,7 +535,7 @@ add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8
 specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
 
-add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index ed1012736..086d075fa 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -1492,11 +1492,10 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
 void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
-                             const unsigned char *thresh, int count) {
+                             const unsigned char *thresh) {
   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
   unsigned char *src[1];
   unsigned char *dst[1];
-  (void)count;
 
   // Transpose 8x8
   src[0] = s - 4;

From 109a47b3426d302df201295aeff9cf0e40badf69 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 19:54:51 -0800
Subject: [PATCH 06/16] vpx_lpf_vertical_4: remove unused count param

Change-Id: I43a191cb3d42e51e7bca266adfa11c6239a8064c
---
 test/lpf_8_test.cc                      | 13 ++++++++-----
 vp10/common/loopfilter.c                | 13 ++++++-------
 vp9/common/vp9_loopfilter.c             | 13 ++++++-------
 vpx_dsp/arm/loopfilter_4_neon.asm       | 16 +---------------
 vpx_dsp/arm/loopfilter_4_neon.c         |  8 ++------
 vpx_dsp/arm/loopfilter_neon.c           |  4 ++--
 vpx_dsp/loopfilter.c                    | 10 ++++------
 vpx_dsp/mips/loopfilter_4_msa.c         |  5 +----
 vpx_dsp/mips/loopfilter_filters_dspr2.c |  7 +++----
 vpx_dsp/vpx_dsp_rtcd_defs.pl            |  2 +-
 vpx_dsp/x86/loopfilter_mmx.asm          | 11 ++---------
 11 files changed, 36 insertions(+), 66 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 9697c88b9..5c83f3a1f 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -459,7 +459,8 @@ INSTANTIATE_TEST_CASE_P(
     MMX, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_lpf_horizontal_4_mmx, &vpx_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vpx_lpf_vertical_4_mmx, &vpx_lpf_vertical_4_c, 8, 1)));
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_mmx>,
+                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
 #endif  // HAVE_MMX
 
 #if HAVE_SSE2
@@ -609,8 +610,8 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_4_neon,
                    &vpx_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vpx_lpf_vertical_4_neon,
-                   &vpx_lpf_vertical_4_c, 8, 1)));
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_neon>,
+                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(
@@ -637,7 +638,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_dspr2,
                    &vpx_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vpx_lpf_vertical_4_dspr2, &vpx_lpf_vertical_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_dspr2>,
+                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_dspr2>,
                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dspr2>,
@@ -666,7 +668,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vpx_lpf_vertical_4_msa, &vpx_lpf_vertical_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_msa>,
+                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_msa>,
                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_msa>,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 4171c1e08..9f55dc248 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -358,11 +358,10 @@ static void filter_selectively_vert_row2(int subsampling_factor,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                             1);
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
           vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+                             lfi1->hev_thr);
         }
       }
 
@@ -373,10 +372,10 @@ static void filter_selectively_vert_row2(int subsampling_factor,
                                   lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
           vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                             lfi0->hev_thr, 1);
+                             lfi0->hev_thr);
         } else {
           vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+                             lfi1->hev_thr);
         }
       }
     }
@@ -1128,11 +1127,11 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
       } else if (mask_8x8 & 1) {
         vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_4x4 & 1) {
-        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       }
     }
     if (mask_4x4_int & 1)
-      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 8c281c2ec..e892f78d0 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -358,11 +358,10 @@ static void filter_selectively_vert_row2(int subsampling_factor,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                             1);
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
         } else {
           vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+                             lfi1->hev_thr);
         }
       }
 
@@ -373,10 +372,10 @@ static void filter_selectively_vert_row2(int subsampling_factor,
                                   lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
           vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                             lfi0->hev_thr, 1);
+                             lfi0->hev_thr);
         } else {
           vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+                             lfi1->hev_thr);
         }
       }
     }
@@ -1103,11 +1102,11 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
       } else if (mask_8x8 & 1) {
         vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_4x4 & 1) {
-        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       }
     }
     if (mask_4x4_int & 1)
-      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
diff --git a/vpx_dsp/arm/loopfilter_4_neon.asm b/vpx_dsp/arm/loopfilter_4_neon.asm
index e45e34cd4..d794f552a 100644
--- a/vpx_dsp/arm/loopfilter_4_neon.asm
+++ b/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -79,37 +79,29 @@ end_vpx_lf_h_edge
 
 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
 ;
 ; void vpx_lpf_vertical_4_neon(uint8_t *s,
 ;                              int p /* pitch */,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
-;                              const uint8_t *thresh,
-;                              int count)
+;                              const uint8_t *thresh)
 ;
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_vertical_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #8]             ; load count
     vld1.8      {d1[]}, [r3]              ; duplicate *limit
 
     ldr         r3, [sp, #4]              ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vpx_lf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
-count_lf_v_loop
     vld1.u8     {d3}, [r2], r1             ; load s data
     vld1.u8     {d4}, [r2], r1
     vld1.u8     {d5}, [r2], r1
@@ -149,12 +141,6 @@ count_lf_v_loop
     vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
     vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
 
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_lf_v_loop
-
-end_vpx_lf_v_edge
     pop         {pc}
     ENDP        ; |vpx_lpf_vertical_4_neon|
 
diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
index 7ad411aea..db9ea6a9d 100644
--- a/vpx_dsp/arm/loopfilter_4_neon.c
+++ b/vpx_dsp/arm/loopfilter_4_neon.c
@@ -170,8 +170,7 @@ void vpx_lpf_vertical_4_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i, pitch8;
     uint8_t *s;
     uint8x8_t dblimit, dlimit, dthresh;
@@ -181,15 +180,12 @@ void vpx_lpf_vertical_4_neon(
     uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
     uint8x8x4_t d4Result;
 
-    if (count == 0)  // end_vpx_lf_h_edge
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
     pitch8 = pitch * 8;
-    for (i = 0; i < count; i++, src += pitch8) {
+    for (i = 0; i < 1; i++, src += pitch8) {
         s = src - (i + 1) * 4;
 
         d3u8 = vld1_u8(s);
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index 581410541..b01944ebb 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -21,8 +21,8 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 #if HAVE_NEON_ASM
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index fdb5dbbab..1604fdbcf 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -143,13 +143,12 @@ void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
 }
 
 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count) {
+                          const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
     const int8_t mask = filter_mask(*limit, *blimit,
@@ -163,9 +162,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
-                                  thresh1, 1);
+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
 static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c
index daf5f38bf..ebeaddd21 100644
--- a/vpx_dsp/mips/loopfilter_4_msa.c
+++ b/vpx_dsp/mips/loopfilter_4_msa.c
@@ -74,14 +74,11 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
 void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
-                            const uint8_t *thresh_ptr,
-                            int32_t count) {
+                            const uint8_t *thresh_ptr) {
   v16u8 mask, hev, flat, limit, thresh, b_limit;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   v8i16 vec0, vec1, vec2, vec3;
 
-  (void)count;
-
   LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
index 529df4ee6..9924982f1 100644
--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -117,8 +117,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
-                              const uint8_t *thresh,
-                              int count) {
+                              const uint8_t *thresh) {
   uint8_t   i;
   uint32_t  mask, hev;
   uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
@@ -335,8 +334,8 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index feaf0ae7e..eeb03b671 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -542,7 +542,7 @@ add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_
 specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
 
-add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
index b9c18b680..dee565ce0 100644
--- a/vpx_dsp/x86/loopfilter_mmx.asm
+++ b/vpx_dsp/x86/loopfilter_mmx.asm
@@ -230,14 +230,13 @@ sym(vpx_lpf_horizontal_4_mmx):
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;    const char *limit,
-;    const char *thresh,
-;    int count
+;    const char *thresh
 ;)
 global sym(vpx_lpf_vertical_4_mmx) PRIVATE
 sym(vpx_lpf_vertical_4_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -254,8 +253,6 @@ sym(vpx_lpf_vertical_4_mmx):
 
         lea         rsi,        [rsi + rax*4 - 4]
 
-        movsxd      rcx,        dword ptr arg(5) ;count
-.next8_v:
         mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
         add         rdi,        rax
 
@@ -579,10 +576,6 @@ sym(vpx_lpf_vertical_4_mmx):
 
         movd        [rdi+rax*2+2], mm5
 
-        lea         rsi,        [rsi+rax*8]
-        dec         rcx
-        jnz         .next8_v
-
     add rsp, 64
     pop rsp
     ; begin epilog

From bd5a5bb561845a6d6a2a0295d7681be09a66ec48 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 20:02:53 -0800
Subject: [PATCH 07/16] vpx_lpf_horizontal_8: remove unused count param

Change-Id: I48741e167a7b09b7c9ad3bfc1c4b88ef1029ae46
---
 test/lpf_8_test.cc                      | 13 ++++++++-----
 vp10/common/loopfilter.c                |  2 +-
 vp9/common/vp9_loopfilter.c             |  2 +-
 vpx_dsp/arm/loopfilter_8_neon.asm       | 16 +---------------
 vpx_dsp/arm/loopfilter_8_neon.c         |  8 ++------
 vpx_dsp/arm/loopfilter_neon.c           |  4 ++--
 vpx_dsp/loopfilter.c                    |  9 ++++-----
 vpx_dsp/mips/loopfilter_8_msa.c         |  5 +----
 vpx_dsp/mips/loopfilter_filters_dspr2.c |  4 ++--
 vpx_dsp/mips/loopfilter_mb_dspr2.c      |  3 +--
 vpx_dsp/vpx_dsp_rtcd_defs.pl            |  2 +-
 vpx_dsp/x86/loopfilter_sse2.c           |  6 ++----
 12 files changed, 26 insertions(+), 48 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 5c83f3a1f..394360e2f 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -520,7 +520,8 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_sse2>,
+                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_sse2>,
@@ -604,8 +605,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_neon>,
                    &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&vpx_lpf_horizontal_8_neon,
-                   &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_neon>,
+                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_neon>,
                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_4_neon,
@@ -633,7 +634,8 @@ INSTANTIATE_TEST_CASE_P(
     DSPR2, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_8_dspr2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>,
+                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_dspr2,
                    &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_dspr2,
@@ -665,7 +667,8 @@ INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>,
+                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_msa>,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 9f55dc248..6bbf191ac 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -542,7 +542,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           }
           count = 2;
         } else {
-          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index e892f78d0..d5431c2c2 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -542,7 +542,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           }
           count = 2;
         } else {
-          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
index 61cabe8e8..a2f20e15f 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -16,35 +16,26 @@
 
 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
 ;
 ; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
-;                                const uint8_t *thresh,
-;                                int count)
+;                                const uint8_t *thresh)
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_horizontal_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #16]             ; load count
     ldr         r2, [sp, #12]              ; load thresh
     add         r1, r1, r1                 ; double pitch
 
-    cmp         r12, #0
-    beq         end_vpx_mblf_h_edge
-
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
 
-count_mblf_h_loop
     sub         r3, r0, r1, lsl #1         ; move src pointer down by 4 lines
     add         r2, r3, r1, lsr #1         ; set to 3 lines down
 
@@ -69,11 +60,6 @@ count_mblf_h_loop
     vst1.u8     {d4}, [r2@64], r1          ; store oq1
     vst1.u8     {d5}, [r3@64], r1          ; store oq2
 
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_mblf_h_loop
-
-end_vpx_mblf_h_edge
     pop         {r4-r5, pc}
 
     ENDP        ; |vpx_lpf_horizontal_8_neon|
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
index 3c005700f..ec3757380 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -268,23 +268,19 @@ void vpx_lpf_horizontal_8_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i;
     uint8_t *s, *psrc;
     uint8x8_t dblimit, dlimit, dthresh;
     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
     uint8x8_t d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vpx_mblf_h_edge
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
     psrc = src - (pitch << 2);
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < 1; i++) {
         s = psrc + i * 8;
 
         d3u8  = vld1_u8(s);
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index b01944ebb..aa31f2935 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -33,8 +33,8 @@ void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 1604fdbcf..e8092d912 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -188,13 +188,12 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
 }
 
 void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
-                            const uint8_t *limit, const uint8_t *thresh,
-                            int count) {
+                            const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
@@ -211,8 +210,8 @@ void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c
index ec3f5dd22..5b22bd002 100644
--- a/vpx_dsp/mips/loopfilter_8_msa.c
+++ b/vpx_dsp/mips/loopfilter_8_msa.c
@@ -13,8 +13,7 @@
 void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
-                              const uint8_t *thresh_ptr,
-                              int32_t count) {
+                              const uint8_t *thresh_ptr) {
   uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
   v16u8 mask, hev, flat, thresh, b_limit, limit;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
@@ -23,8 +22,6 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
   v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
   v16i8 zero = { 0 };
 
-  (void)count;
-
   /* load vector elements */
   LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
index 9924982f1..8a24372cb 100644
--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -323,8 +323,8 @@ void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c
index 5bbf091c8..dd0545eed 100644
--- a/vpx_dsp/mips/loopfilter_mb_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -23,8 +23,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
-                                const uint8_t *thresh,
-                                int count) {
+                                const uint8_t *thresh) {
   uint32_t  mask;
   uint32_t  hev, flat;
   uint8_t   i;
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index eeb03b671..3f63a5f62 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -552,7 +552,7 @@ add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t
 specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
 $vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
 
-add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index 086d075fa..e1236dc4d 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -730,7 +730,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
-                               const unsigned char *_thresh, int count) {
+                               const unsigned char *_thresh) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
@@ -745,8 +745,6 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
 
-  (void)count;
-
   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
@@ -1504,7 +1502,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
   dst[0] = s - 4;

From b1e97c6a25d53fb2e62e2fb857fbf146bb19cbd3 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 20:26:54 -0800
Subject: [PATCH 08/16] vpx_lpf_horizontal_4: remove unused count param

Change-Id: Iec7d8eda343991f7d7d46931dca17af23c821d11
---
 test/lpf_8_test.cc                      | 13 ++++++++-----
 vp10/common/loopfilter.c                | 16 ++++++++--------
 vp9/common/vp9_loopfilter.c             | 16 ++++++++--------
 vpx_dsp/arm/loopfilter_4_neon.asm       | 16 +---------------
 vpx_dsp/arm/loopfilter_4_neon.c         |  8 ++------
 vpx_dsp/loopfilter.c                    |  8 ++++----
 vpx_dsp/mips/loopfilter_4_msa.c         |  5 +----
 vpx_dsp/mips/loopfilter_filters_dspr2.c |  7 +++----
 vpx_dsp/vpx_dsp_rtcd_defs.pl            |  2 +-
 vpx_dsp/x86/loopfilter_mmx.asm          | 12 ++----------
 10 files changed, 38 insertions(+), 65 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 394360e2f..e6fe1e508 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -458,7 +458,8 @@ using std::tr1::make_tuple;
 INSTANTIATE_TEST_CASE_P(
     MMX, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_4_mmx, &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_mmx>,
+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_mmx>,
                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
 #endif  // HAVE_MMX
@@ -609,8 +610,8 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_neon>,
                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_4_neon,
-                   &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_neon>,
+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_neon>,
                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
 INSTANTIATE_TEST_CASE_P(
@@ -633,7 +634,8 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     DSPR2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_dspr2>,
+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>,
                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_dspr2,
@@ -666,7 +668,8 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_msa>,
+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>,
                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 6bbf191ac..1f7ce981f 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -535,10 +535,10 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1)
               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr, 1);
+                                   lfi->hev_thr);
             else if (mask_4x4_int & 2)
               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr, 1);
+                                   lfin->lim, lfin->hev_thr);
           }
           count = 2;
         } else {
@@ -546,7 +546,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr, 1);
+                                 lfi->hev_thr);
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
@@ -563,22 +563,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1)
               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr, 1);
+                                   lfi->hev_thr);
             else if (mask_4x4_int & 2)
               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr, 1);
+                                   lfin->lim, lfin->hev_thr);
           }
           count = 2;
         } else {
-          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr, 1);
+                                 lfi->hev_thr);
         }
       } else if (mask_4x4_int & 1) {
         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                             lfi->hev_thr, 1);
+                             lfi->hev_thr);
       }
     }
     s += 8 * count;
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index d5431c2c2..7cc833e19 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -535,10 +535,10 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1)
               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr, 1);
+                                   lfi->hev_thr);
             else if (mask_4x4_int & 2)
               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr, 1);
+                                   lfin->lim, lfin->hev_thr);
           }
           count = 2;
         } else {
@@ -546,7 +546,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr, 1);
+                                 lfi->hev_thr);
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
@@ -563,22 +563,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1)
               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr, 1);
+                                   lfi->hev_thr);
             else if (mask_4x4_int & 2)
               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr, 1);
+                                   lfin->lim, lfin->hev_thr);
           }
           count = 2;
         } else {
-          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr, 1);
+                                 lfi->hev_thr);
         }
       } else if (mask_4x4_int & 1) {
         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                             lfi->hev_thr, 1);
+                             lfi->hev_thr);
       }
     }
     s += 8 * count;
diff --git a/vpx_dsp/arm/loopfilter_4_neon.asm b/vpx_dsp/arm/loopfilter_4_neon.asm
index d794f552a..937115898 100644
--- a/vpx_dsp/arm/loopfilter_4_neon.asm
+++ b/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -16,37 +16,28 @@
 
 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
 ;
 ; void vpx_lpf_horizontal_4_neon(uint8_t *s,
 ;                                int p /* pitch */,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
-;                                const uint8_t *thresh,
-;                                int count)
+;                                const uint8_t *thresh)
 ;
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_horizontal_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #8]              ; load count
     ldr         r2, [sp, #4]               ; load thresh
     add         r1, r1, r1                 ; double pitch
 
-    cmp         r12, #0
-    beq         end_vpx_lf_h_edge
-
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
 
-count_lf_h_loop
     sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
     add         r3, r2, r1, lsr #1         ; set to 3 lines down
 
@@ -69,11 +60,6 @@ count_lf_h_loop
     vst1.u8     {d6}, [r2@64], r1          ; store oq0
     vst1.u8     {d7}, [r3@64], r1          ; store oq1
 
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_lf_h_loop
-
-end_vpx_lf_h_edge
     pop         {pc}
     ENDP        ; |vpx_lpf_horizontal_4_neon|
 
diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
index db9ea6a9d..7f3ee70b9 100644
--- a/vpx_dsp/arm/loopfilter_4_neon.c
+++ b/vpx_dsp/arm/loopfilter_4_neon.c
@@ -115,22 +115,18 @@ void vpx_lpf_horizontal_4_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i;
     uint8_t *s, *psrc;
     uint8x8_t dblimit, dlimit, dthresh;
     uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vpx_lf_h_edge
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
     psrc = src - (pitch << 2);
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < 1; i++) {
         s = psrc + i * 8;
 
         d3u8 = vld1_u8(s);
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index e8092d912..e545d36ab 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -119,12 +119,12 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
 
 void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                             const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh, int count) {
+                            const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
     const int8_t mask = filter_mask(*limit, *blimit,
@@ -138,8 +138,8 @@ void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c
index ebeaddd21..936347031 100644
--- a/vpx_dsp/mips/loopfilter_4_msa.c
+++ b/vpx_dsp/mips/loopfilter_4_msa.c
@@ -13,14 +13,11 @@
 void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
-                              const uint8_t *thresh_ptr,
-                              int32_t count) {
+                              const uint8_t *thresh_ptr) {
   uint64_t p1_d, p0_d, q0_d, q1_d;
   v16u8 mask, hev, flat, thresh, b_limit, limit;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
 
-  (void)count;
-
   /* load vector elements */
   LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
index 8a24372cb..8414b9ed5 100644
--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -23,8 +23,7 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
-                                const uint8_t *thresh,
-                                int count) {
+                                const uint8_t *thresh) {
   uint8_t   i;
   uint32_t  mask;
   uint32_t  hev;
@@ -312,8 +311,8 @@ void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 3f63a5f62..36c89db8f 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -559,7 +559,7 @@ add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint
 specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
 
-add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
index dee565ce0..15105e3ed 100644
--- a/vpx_dsp/x86/loopfilter_mmx.asm
+++ b/vpx_dsp/x86/loopfilter_mmx.asm
@@ -18,14 +18,13 @@
 ;    int src_pixel_step,
 ;    const char *blimit,
 ;    const char *limit,
-;    const char *thresh,
-;    int  count
+;    const char *thresh
 ;)
 global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
 sym(vpx_lpf_horizontal_4_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -39,8 +38,6 @@ sym(vpx_lpf_horizontal_4_mmx):
         mov         rsi, arg(0) ;src_ptr
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
-        movsxd      rcx, dword ptr arg(5) ;count
-.next8_h:
         mov         rdx, arg(3) ;limit
         movq        mm7, [rdx]
         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
@@ -208,11 +205,6 @@ sym(vpx_lpf_horizontal_4_mmx):
         pxor        mm7, [GLOBAL(t80)]    ; unoffset
         movq        [rdi], mm7            ; write back
 
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .next8_h
-
     add rsp, 32
     pop rsp
     ; begin epilog

From 72a9f06ac2f276d55fc1111012a9b35f6e30b711 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 20:49:56 -0800
Subject: [PATCH 09/16] vpx_highbd_lpf_vertical_8: remove unused count param

Change-Id: Id16f7259897654831d31642c2d5e0bbe5e13416c
---
 test/lpf_8_test.cc                   | 12 ++++++------
 vp10/common/loopfilter.c             |  6 +++---
 vp9/common/vp9_loopfilter.c          |  6 +++---
 vpx_dsp/loopfilter.c                 |  8 ++++----
 vpx_dsp/vpx_dsp_rtcd_defs.pl         |  2 +-
 vpx_dsp/x86/highbd_loopfilter_sse2.c |  3 +--
 6 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index e6fe1e508..c3b2c7807 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -479,8 +479,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
                    &vpx_highbd_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
-                   &vpx_highbd_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
@@ -493,8 +493,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_16_c, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
                    &vpx_highbd_lpf_horizontal_16_c, 10, 2),
-        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
-                   &vpx_highbd_lpf_vertical_8_c, 10, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
@@ -507,8 +507,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_16_c, 12, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
                    &vpx_highbd_lpf_horizontal_16_c, 12, 2),
-        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
-                   &vpx_highbd_lpf_vertical_8_c, 12, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_dual_sse2>,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 1f7ce981f..57480bd4f 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -444,10 +444,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                          lfi1->hev_thr, bd);
         } else if (mask_8x8_0 & 1) {
           vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+                                    lfi0->hev_thr, bd);
         } else {
           vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+                                    lfi1->lim, lfi1->hev_thr, bd);
         }
       }
 
@@ -1161,7 +1161,7 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                    lfi->hev_thr, bd);
       } else if (mask_8x8 & 1) {
         vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
-                                  lfi->hev_thr, 1, bd);
+                                  lfi->hev_thr, bd);
       } else if (mask_4x4 & 1) {
         vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 7cc833e19..9bf453e37 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -444,10 +444,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                          lfi1->hev_thr, bd);
         } else if (mask_8x8_0 & 1) {
           vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+                                    lfi0->hev_thr, bd);
         } else {
           vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+                                    lfi1->lim, lfi1->hev_thr, bd);
         }
       }
 
@@ -1136,7 +1136,7 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                    lfi->hev_thr, bd);
       } else if (mask_8x8 & 1) {
         vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
-                                  lfi->hev_thr, 1, bd);
+                                  lfi->hev_thr, bd);
       } else if (mask_4x4 & 1) {
         vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index e545d36ab..2a22f7ee7 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -565,10 +565,10 @@ void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
 
 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
-                                 int count, int bd) {
+                                 int bd) {
   int i;
 
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask = highbd_filter_mask(*limit, *blimit,
@@ -591,9 +591,9 @@ void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
   vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
-                              thresh1, 1, bd);
+                              thresh1, bd);
 }
 
 static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 36c89db8f..52cd6a889 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -572,7 +572,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index c4fd5e1a0..9fc1f5b9d 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -1112,11 +1112,10 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
-                                    int count, int bd) {
+                                    int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
-  (void)count;
 
   // Transpose 8x8
   src[0] = s - 4;

From 3c1019e49df424ac0c012a409153cb2bc551a50d Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 20:50:42 -0800
Subject: [PATCH 10/16] vpx_highbd_lpf_vertical_4: remove unused count param

Change-Id: Ic6da723c5cf3cd8127db1f476c3e46ea134cb774
---
 test/lpf_8_test.cc                   | 12 ++++++------
 vp10/common/loopfilter.c             | 12 ++++++------
 vp9/common/vp9_loopfilter.c          | 12 ++++++------
 vpx_dsp/loopfilter.c                 |  8 ++++----
 vpx_dsp/vpx_dsp_rtcd_defs.pl         |  2 +-
 vpx_dsp/x86/highbd_loopfilter_sse2.c |  3 +--
 6 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index c3b2c7807..5d8eb1f1f 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -471,8 +471,8 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
                    &vpx_highbd_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
-                   &vpx_highbd_lpf_vertical_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
                    &vpx_highbd_lpf_horizontal_8_c, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
@@ -485,8 +485,8 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
                    &vpx_highbd_lpf_horizontal_4_c, 10, 1),
-        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
-                   &vpx_highbd_lpf_vertical_4_c, 10, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
                    &vpx_highbd_lpf_horizontal_8_c, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
@@ -499,8 +499,8 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
                    &vpx_highbd_lpf_horizontal_4_c, 12, 1),
-        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
-                   &vpx_highbd_lpf_vertical_4_c, 12, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 12, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
                    &vpx_highbd_lpf_horizontal_8_c, 12, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 57480bd4f..453b3319e 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -458,10 +458,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_0 & 1) {
           vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+                                    lfi0->hev_thr, bd);
         } else {
           vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+                                    lfi1->lim, lfi1->hev_thr, bd);
         }
       }
 
@@ -472,10 +472,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_int_0 & 1) {
           vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+                                    lfi0->hev_thr, bd);
         } else {
           vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+                                    lfi1->lim, lfi1->hev_thr, bd);
         }
       }
     }
@@ -1164,12 +1164,12 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                   lfi->hev_thr, bd);
       } else if (mask_4x4 & 1) {
         vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1, bd);
+                                  lfi->hev_thr, bd);
       }
     }
     if (mask_4x4_int & 1)
       vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1, bd);
+                                lfi->hev_thr, bd);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 9bf453e37..9dbec0959 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -458,10 +458,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_0 & 1) {
           vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+                                    lfi0->hev_thr, bd);
         } else {
           vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+                                    lfi1->lim, lfi1->hev_thr, bd);
         }
       }
 
@@ -472,10 +472,10 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_int_0 & 1) {
           vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+                                    lfi0->hev_thr, bd);
         } else {
           vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+                                    lfi1->lim, lfi1->hev_thr, bd);
         }
       }
     }
@@ -1139,12 +1139,12 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                   lfi->hev_thr, bd);
       } else if (mask_4x4 & 1) {
         vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1, bd);
+                                  lfi->hev_thr, bd);
       }
     }
     if (mask_4x4_int & 1)
       vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1, bd);
+                                lfi->hev_thr, bd);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 2a22f7ee7..740d74757 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -480,12 +480,12 @@ void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
 
 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
-                                 int count, int bd) {
+                                 int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
     const int8_t mask = highbd_filter_mask(*limit, *blimit,
@@ -503,9 +503,9 @@ void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
   vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
-                              thresh1, 1, bd);
+                              thresh1, bd);
 }
 
 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 52cd6a889..c2b042271 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -578,7 +578,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
   specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index 9fc1f5b9d..a48bbd316 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -1058,11 +1058,10 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
-                                    int count, int bd) {
+                                    int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
-  (void)count;
 
   // Transpose 8x8
   src[0] = s - 4;

From 51718573295eaf556c9b1d2dab8036837d8adfe7 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 20:54:16 -0800
Subject: [PATCH 11/16] vpx_highbd_lpf_horizontal_8: remove unused count param

Change-Id: Iaca71ea3796115d4c2d43563b4e6f3914e21f1bf
---
 test/lpf_8_test.cc                   | 12 ++++++------
 vp10/common/loopfilter.c             |  2 +-
 vp9/common/vp9_loopfilter.c          |  2 +-
 vpx_dsp/loopfilter.c                 |  8 ++++----
 vpx_dsp/vpx_dsp_rtcd_defs.pl         |  2 +-
 vpx_dsp/x86/highbd_loopfilter_sse2.c | 12 ++++--------
 6 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 5d8eb1f1f..f52d8e8e0 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -473,8 +473,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_4_c, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 8, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
-                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
                    &vpx_highbd_lpf_horizontal_16_c, 8, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
@@ -487,8 +487,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_4_c, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 10, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
-                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
                    &vpx_highbd_lpf_horizontal_16_c, 10, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
@@ -501,8 +501,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_highbd_lpf_horizontal_4_c, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 12, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
-                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 12, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
                    &vpx_highbd_lpf_horizontal_16_c, 12, 1),
         make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 453b3319e..62d34ff26 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -642,7 +642,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           count = 2;
         } else {
           vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1, bd);
+                                      lfi->hev_thr, bd);
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 9dbec0959..d4574e5b5 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -642,7 +642,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           count = 2;
         } else {
           vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1, bd);
+                                      lfi->hev_thr, bd);
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 740d74757..8b740f557 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -531,12 +531,12 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
 
 void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
-                                   int count, int bd) {
+                                   int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
@@ -559,8 +559,8 @@ void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index c2b042271..1e7800ad7 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -587,7 +587,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index a48bbd316..53786de83 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -523,7 +523,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
-                                      int count, int bd) {
+                                      int bd) {
   DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
@@ -556,8 +556,6 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   __m128i work_a;
   __m128i filter1, filter2;
 
-  (void)count;
-
   if (bd == 8) {
     blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
     limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
@@ -764,9 +762,8 @@ void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
-                                   1, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
 }
 
 void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
@@ -1123,8 +1120,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
-                                   bd);
+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
 
   src[0] = t_dst;
   dst[0] = s - 4;

From e7a23d703bc8f62fb387d71c0f70121253dede30 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Thu, 11 Feb 2016 20:59:39 -0800
Subject: [PATCH 12/16] vpx_highbd_lpf_horizontal_4: remove unused count param

Change-Id: I655a771e1b1a8753be5669ef9348a312ba6cfdbc
---
 test/lpf_8_test.cc                   | 12 ++++++------
 vp10/common/loopfilter.c             | 16 ++++++++--------
 vp9/common/vp9_loopfilter.c          | 16 ++++++++--------
 vpx_dsp/loopfilter.c                 |  8 ++++----
 vpx_dsp/vpx_dsp_rtcd_defs.pl         |  2 +-
 vpx_dsp/x86/highbd_loopfilter_sse2.c | 12 ++++--------
 6 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index f52d8e8e0..0d898bc8f 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -469,8 +469,8 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
-                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_4_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
@@ -483,8 +483,8 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 8, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
-                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_4_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_4_c>, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
@@ -497,8 +497,8 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 10, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
-                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_4_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_4_c>, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 62d34ff26..a659aaea4 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -633,10 +633,10 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1) {
               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, 1, bd);
+                                          lfi->lim, lfi->hev_thr, bd);
             } else if (mask_4x4_int & 2) {
               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, 1, bd);
+                                          lfin->lim, lfin->hev_thr, bd);
             }
           }
           count = 2;
@@ -646,7 +646,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, 1, bd);
+                                        lfi->lim, lfi->hev_thr, bd);
           }
         }
       } else if (mask_4x4 & 1) {
@@ -665,25 +665,25 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1) {
               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, 1, bd);
+                                          lfi->lim, lfi->hev_thr, bd);
             } else if (mask_4x4_int & 2) {
               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, 1, bd);
+                                          lfin->lim, lfin->hev_thr, bd);
             }
           }
           count = 2;
         } else {
           vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1, bd);
+                                      lfi->hev_thr, bd);
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, 1, bd);
+                                        lfi->lim, lfi->hev_thr, bd);
           }
         }
       } else if (mask_4x4_int & 1) {
         vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, 1, bd);
+                                    lfi->hev_thr, bd);
       }
     }
     s += 8 * count;
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index d4574e5b5..18420eff8 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -633,10 +633,10 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1) {
               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, 1, bd);
+                                          lfi->lim, lfi->hev_thr, bd);
             } else if (mask_4x4_int & 2) {
               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, 1, bd);
+                                          lfin->lim, lfin->hev_thr, bd);
             }
           }
           count = 2;
@@ -646,7 +646,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, 1, bd);
+                                        lfi->lim, lfi->hev_thr, bd);
           }
         }
       } else if (mask_4x4 & 1) {
@@ -665,25 +665,25 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1) {
               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, 1, bd);
+                                          lfi->lim, lfi->hev_thr, bd);
             } else if (mask_4x4_int & 2) {
               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, 1, bd);
+                                          lfin->lim, lfin->hev_thr, bd);
             }
           }
           count = 2;
         } else {
           vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1, bd);
+                                      lfi->hev_thr, bd);
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, 1, bd);
+                                        lfi->lim, lfi->hev_thr, bd);
           }
         }
       } else if (mask_4x4_int & 1) {
         vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, 1, bd);
+                                    lfi->hev_thr, bd);
       }
     }
     s += 8 * count;
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 8b740f557..0ca1d9d8c 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -445,12 +445,12 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
 
 void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                    const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh, int count, int bd) {
+                                   const uint8_t *thresh, int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4 * p];
     const uint16_t p2 = s[-3 * p];
     const uint16_t p1 = s[-2 * p];
@@ -474,8 +474,8 @@ void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 1e7800ad7..17f11eb89 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -593,7 +593,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index 53786de83..73deb733f 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -770,7 +770,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
-                                      int count, int bd) {
+                                      int bd) {
   const __m128i zero = _mm_set1_epi16(0);
   __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
@@ -810,8 +810,6 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   __m128i work_a;
   __m128i filter1, filter2;
 
-  (void)count;
-
   if (bd == 8) {
     blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
     limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
@@ -941,9 +939,8 @@ void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
-                                   bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
 }
 
 static INLINE void highbd_transpose(uint16_t *src[], int in_p,
@@ -1067,8 +1064,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
-                                   bd);
+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
 
   src[0] = t_dst;
   dst[0] = s - 4;

From 1b519fb666e79f25d93c78d3b90e92e057caa997 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 12 Feb 2016 17:42:34 -0800
Subject: [PATCH 13/16] split vpx_lpf_horizontal_16 in two

replace with vpx_lpf_horizontal_edge_16 and vpx_lpf_horizontal_edge_8 to
avoid passing a count parameter

Change-Id: I848c95c02a3c6ebaa6c2bdf0983dce05cd645271
---
 test/lpf_8_test.cc                       | 35 ++++++++++--------
 vp10/common/loopfilter.c                 |  8 ++--
 vp9/common/vp9_loopfilter.c              |  8 ++--
 vpx_dsp/arm/loopfilter_mb_neon.asm       | 47 +++++++++++++++++++-----
 vpx_dsp/loopfilter.c                     | 16 ++++++--
 vpx_dsp/mips/loopfilter_16_msa.c         | 24 +++++++++---
 vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c | 26 ++++++++++---
 vpx_dsp/vpx_dsp_rtcd_defs.pl             | 10 +++--
 vpx_dsp/x86/loopfilter_avx2.c            | 23 ++++--------
 vpx_dsp/x86/loopfilter_sse2.c            | 34 +++++------------
 10 files changed, 143 insertions(+), 88 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 0d898bc8f..9ddbf71cb 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -523,8 +523,10 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_sse2>,
                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_sse2>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_sse2>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_sse2>,
                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_sse2>,
@@ -538,9 +540,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     AVX2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,
-                   2)));
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_avx2>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_avx2>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1)));
 #endif
 
 #if HAVE_SSE2
@@ -597,10 +600,10 @@ INSTANTIATE_TEST_CASE_P(
 #if HAVE_NEON_ASM
 // Using #if inside the macro is unsupported on MSVS but the tests are not
 // currently built for MSVS with ARM and NEON.
-        make_tuple(&vpx_lpf_horizontal_16_neon,
-                   &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_neon,
-                   &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_neon>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_neon>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_neon>,
                    &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_neon>,
@@ -638,10 +641,10 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>,
                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_dspr2,
-                   &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_dspr2,
-                   &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_8>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_16>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_dspr2>,
                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_dspr2>,
@@ -672,8 +675,10 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>,
                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_msa>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_msa>,
+                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_msa>,
                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_msa>,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index a659aaea4..3fe316db8 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -512,12 +512,12 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 2);
+          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+                                     lfi->hev_thr);
           count = 2;
         } else {
-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1);
+          vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr);
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 18420eff8..aae0a33fa 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -512,12 +512,12 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 2);
+          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+                                     lfi->hev_thr);
           count = 2;
         } else {
-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1);
+          vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr);
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
diff --git a/vpx_dsp/arm/loopfilter_mb_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm
index 20d9cfb11..d5da7a840 100644
--- a/vpx_dsp/arm/loopfilter_mb_neon.asm
+++ b/vpx_dsp/arm/loopfilter_mb_neon.asm
@@ -8,27 +8,28 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vpx_lpf_horizontal_16_neon|
+    EXPORT  |vpx_lpf_horizontal_edge_8_neon|
+    EXPORT  |vpx_lpf_horizontal_edge_16_neon|
     EXPORT  |vpx_lpf_vertical_16_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,
-;                                 const uint8_t *blimit,
-;                                 const uint8_t *limit,
-;                                 const uint8_t *thresh
-;                                 int count)
+; void mb_lpf_horizontal_edge(uint8_t *s, int p,
+;                             const uint8_t *blimit,
+;                             const uint8_t *limit,
+;                             const uint8_t *thresh,
+;                             int count)
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_horizontal_16_neon| PROC
+; r12   int count
+|mb_lpf_horizontal_edge| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
-    ldr         r12, [sp, #92]             ; load count
 
 h_count
     vld1.8      {d16[]}, [r2]              ; load *blimit
@@ -115,7 +116,35 @@ h_next
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vpx_lpf_horizontal_16_neon|
+    ENDP        ; |mb_lpf_horizontal_edge|
+
+; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
+;                                     const uint8_t *blimit,
+;                                     const uint8_t *limit,
+;                                     const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh
+|vpx_lpf_horizontal_edge_8_neon| PROC
+    mov r12, #1
+    b mb_lpf_horizontal_edge
+    ENDP        ; |vpx_lpf_horizontal_edge_8_neon|
+
+; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
+;                                      const uint8_t *blimit,
+;                                      const uint8_t *limit,
+;                                      const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh
+|vpx_lpf_horizontal_edge_16_neon| PROC
+    mov r12, #2
+    b mb_lpf_horizontal_edge
+    ENDP        ; |vpx_lpf_horizontal_edge_16_neon|
 
 ; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
 ;                               const uint8_t *blimit,
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 0ca1d9d8c..f866a3dcf 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -289,9 +289,9 @@ static INLINE void filter16(int8_t mask, uint8_t thresh,
   }
 }
 
-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh,
-                             int count) {
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh, int count) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
@@ -315,6 +315,16 @@ void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
   }
 }
 
+void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+}
+
+void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+}
+
 static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
diff --git a/vpx_dsp/mips/loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c
index b7c9f7bd0..a6c581d72 100644
--- a/vpx_dsp/mips/loopfilter_16_msa.c
+++ b/vpx_dsp/mips/loopfilter_16_msa.c
@@ -423,11 +423,11 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
   }
 }
 
-void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
-                               const uint8_t *b_limit_ptr,
-                               const uint8_t *limit_ptr,
-                               const uint8_t *thresh_ptr,
-                               int32_t count) {
+static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr,
+                                   int32_t count) {
   if (1 == count) {
     uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
     uint64_t dword0, dword1;
@@ -648,6 +648,20 @@ void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
   }
 }
 
+void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
+}
+
+void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
 static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                    uint8_t *output, int32_t out_pitch) {
   v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
diff --git a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 8a4865073..85e167ca0 100644
--- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -19,12 +19,12 @@
 #include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
-                                 int pitch,
-                                 const uint8_t *blimit,
-                                 const uint8_t *limit,
-                                 const uint8_t *thresh,
-                                 int count) {
+static void mb_lpf_horizontal_edge(unsigned char *s,
+                                   int pitch,
+                                   const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh,
+                                   int count) {
   uint32_t  mask;
   uint32_t  hev, flat, flat2;
   uint8_t   i;
@@ -791,4 +791,18 @@ void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
     s = s + 4;
   }
 }
+
+void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh) {
+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
+}
+
+void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh) {
+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
+}
 #endif  // #if HAVE_DSPR2
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 17f11eb89..557b4c55f 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -548,9 +548,13 @@ specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
 
-add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
-$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
+add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon;
+
+add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon;
 
 add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
index 23a97dd05..be1087c1e 100644
--- a/vpx_dsp/x86/loopfilter_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c
@@ -13,9 +13,10 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
-        const unsigned char *_blimit, const unsigned char *_limit,
-        const unsigned char *_thresh) {
+void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
+                                    const unsigned char *_blimit,
+                                    const unsigned char *_limit,
+                                    const unsigned char *_thresh) {
     __m128i mask, hev, flat, flat2;
     const __m128i zero = _mm_set1_epi16(0);
     const __m128i one = _mm_set1_epi8(1);
@@ -400,9 +401,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
-        const unsigned char *_blimit, const unsigned char *_limit,
-        const unsigned char *_thresh) {
+void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
+                                     const unsigned char *_blimit,
+                                     const unsigned char *_limit,
+                                     const unsigned char *_thresh) {
     __m128i mask, hev, flat, flat2;
     const __m128i zero = _mm_set1_epi16(0);
     const __m128i one = _mm_set1_epi8(1);
@@ -975,12 +977,3 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
     }
 }
-
-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
-        const unsigned char *_blimit, const unsigned char *_limit,
-        const unsigned char *_thresh, int count) {
-    if (count == 1)
-        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
-    else
-        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
-}
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index e1236dc4d..e03508a03 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -18,11 +18,10 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
-static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
-                                            int p,
-                                            const unsigned char *_blimit,
-                                            const unsigned char *_limit,
-                                            const unsigned char *_thresh) {
+void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit,
+                                    const unsigned char *_limit,
+                                    const unsigned char *_thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -383,11 +382,10 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
 }
 
-static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
-                                             int p,
-                                             const unsigned char *_blimit,
-                                             const unsigned char *_limit,
-                                             const unsigned char *_thresh) {
+void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit,
+                                     const unsigned char *_limit,
+                                     const unsigned char *_thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -716,17 +714,6 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
   }
 }
 
-// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh, int count) {
-  if (count == 1)
-    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
-  else
-    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
-}
-
 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
@@ -1554,7 +1541,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   transpose(src, p, dst, 8, 2);
 
   // Loop filtering
-  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
+  vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
   src[1] = t_dst + 8 * 8;
@@ -1575,8 +1562,7 @@ void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
 
   // Loop filtering
-  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
-                                   thresh);
+  vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
 
   // Transpose back
   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);

From 9b44d9d00fcf015f9a8ab5cde7ee5d62c00a0495 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 12 Feb 2016 18:12:57 -0800
Subject: [PATCH 14/16] split vpx_highbd_lpf_horizontal_16 in two

replace with vpx_highbd_lpf_horizontal_edge_16 and
vpx_highbd_lpf_horizontal_edge_8 to avoid passing a count parameter

Change-Id: I551f8cec0fce57032cb2652584bb802e2248644d
---
 test/lpf_8_test.cc                   | 24 +++++++--------
 vp10/common/loopfilter.c             |  8 ++---
 vp9/common/vp9_loopfilter.c          |  8 ++---
 vpx_dsp/loopfilter.c                 | 22 ++++++++++++--
 vpx_dsp/vpx_dsp_rtcd_defs.pl         |  7 +++--
 vpx_dsp/x86/highbd_loopfilter_sse2.c | 45 +++++++++-------------------
 6 files changed, 58 insertions(+), 56 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 9ddbf71cb..13a4c476f 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -475,10 +475,10 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_c>, 8, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 8, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
@@ -489,10 +489,10 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 10, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_c>, 10, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_c>, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 10, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
@@ -503,10 +503,10 @@ INSTANTIATE_TEST_CASE_P(
                    &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 12, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_c>, 12, 1),
+        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_sse2>,
+                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_c>, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
                    &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 12, 1),
         make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 3fe316db8..11dfe6d70 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -609,12 +609,12 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 2, bd);
+          vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+                                            lfi->hev_thr, bd);
           count = 2;
         } else {
-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 1, bd);
+          vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, bd);
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index aae0a33fa..ee20cc557 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -609,12 +609,12 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 2, bd);
+          vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+                                            lfi->hev_thr, bd);
           count = 2;
         } else {
-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 1, bd);
+          vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, bd);
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index f866a3dcf..46ef64617 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -669,9 +669,11 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
   }
 }
 
-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int count, int bd) {
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+                                            const uint8_t *blimit,
+                                            const uint8_t *limit,
+                                            const uint8_t *thresh,
+                                            int count, int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
@@ -703,6 +705,20 @@ void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
   }
 }
 
+void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
+                                         const uint8_t *blimit,
+                                         const uint8_t *limit,
+                                         const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+}
+
 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 557b4c55f..d7835f4a7 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -588,8 +588,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
   specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+  add_proto qw/void vpx_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_edge_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_edge_16 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index 73deb733f..72e42adc9 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -51,12 +51,10 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
 
 // TODO(debargha, peter): Break up large functions into smaller ones
 // in this file.
-static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
-                                                   int p,
-                                                   const uint8_t *_blimit,
-                                                   const uint8_t *_limit,
-                                                   const uint8_t *_thresh,
-                                                   int bd) {
+void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit,
+                                           const uint8_t *_limit,
+                                           const uint8_t *_thresh, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi16(1);
   __m128i blimit, limit, thresh;
@@ -496,27 +494,12 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
   _mm_store_si128((__m128i *)(s - 0 * p), q0);
 }
 
-static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
-                                                    int p,
-                                                    const uint8_t *_blimit,
-                                                    const uint8_t *_limit,
-                                                    const uint8_t *_thresh,
-                                                    int bd) {
-  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
-  highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,
-                                         bd);
-}
-
-// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
-                                       const uint8_t *_blimit,
-                                       const uint8_t *_limit,
-                                       const uint8_t *_thresh,
-                                       int count, int bd) {
-  if (count == 1)
-    highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
-  else
-    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
+                                            const uint8_t *_blimit,
+                                            const uint8_t *_limit,
+                                            const uint8_t *_thresh, int bd) {
+  vpx_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd);
+  vpx_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
 }
 
 void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
@@ -1171,8 +1154,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
   highbd_transpose(src, p, dst, 8, 2);
 
   // Loop filtering
-  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
-                                         thresh, bd);
+  vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit,
+                                        thresh, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8 * 8;
   dst[0] = s - 8;
@@ -1195,8 +1178,8 @@ void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
   highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
 
   //  Loop filtering
-  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
-                                          thresh, bd);
+  vpx_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
+                                         thresh, bd);
 
   //  Transpose back
   highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);

From 110d3778993a4c75353b9ec4e6de19bcd0646570 Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Fri, 12 Feb 2016 18:17:54 -0800
Subject: [PATCH 15/16] remove loopfilter 'count' param TODOs

Change-Id: I25ce7314372ce2f521526ea7864ffc4ab62e4519
---
 vp10/common/loopfilter.c    | 2 --
 vp9/common/vp9_loopfilter.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 11dfe6d70..8f4fc8ccd 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -324,7 +324,6 @@ static void filter_selectively_vert_row2(int subsampling_factor,
     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
 
-    // TODO(yunqingwang): count in loopfilter functions should be removed.
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
@@ -422,7 +421,6 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
 
-    // TODO(yunqingwang): count in loopfilter functions should be removed.
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index ee20cc557..aca69bd0f 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -324,7 +324,6 @@ static void filter_selectively_vert_row2(int subsampling_factor,
     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
 
-    // TODO(yunqingwang): count in loopfilter functions should be removed.
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
@@ -422,7 +421,6 @@ static void highbd_filter_selectively_vert_row2(int subsampling_factor,
     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
 
-    // TODO(yunqingwang): count in loopfilter functions should be removed.
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {

From 3ea537c0eeb60d33b5661e965384ca4a2ecdcded Mon Sep 17 00:00:00 2001
From: James Zern <jzern@google.com>
Date: Sat, 13 Feb 2016 11:05:24 -0800
Subject: [PATCH 16/16] lpf_8_test: remove unneeded function wrapper

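the count parameter has been removed from all loopfilter functions, so
the wrapper_nc adapter that discarded it (and the matching 'count' test
parameter) is no longer needed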

Change-Id: I87ba72006b59c65c46ca40bcb1c29171dfe0598a
---
 test/lpf_8_test.cc | 274 +++++++++++++++++++++------------------------
 1 file changed, 125 insertions(+), 149 deletions(-)

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 13a4c476f..b16f14c8e 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -37,43 +37,21 @@ const int number_of_iterations = 10000;
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
-                          int count, int bd);
+                          int bd);
 typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1, int bd);
-
-// wrapper for loopfilter functions without a 'count' param.
-typedef void (*loop_op_nc_t)(uint16_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh,
-                             int bd);
-template <loop_op_nc_t fn>
-void wrapper_nc(uint16_t *s, int p, const uint8_t *blimit,
-                const uint8_t *limit, const uint8_t *thresh,
-                int /*count*/, int bd) {
-  fn(s, p, blimit, limit, thresh, bd);
-}
 #else
 typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count);
+                          const uint8_t *limit, const uint8_t *thresh);
 typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1);
-
-// wrapper for loopfilter functions without a 'count' param.
-typedef void (*loop_op_nc_t)(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh);
-template <loop_op_nc_t fn>
-void wrapper_nc(uint8_t *s, int p, const uint8_t *blimit,
-                const uint8_t *limit, const uint8_t *thresh,
-                int /*count*/) {
-  fn(s, p, blimit, limit, thresh);
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
 typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
 
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
@@ -83,7 +61,6 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
-    count_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
   }
 
@@ -91,7 +68,6 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
 
  protected:
   int bit_depth_;
-  int count_;
   int mask_;
   loop_op_t loopfilter_op_;
   loop_op_t ref_loopfilter_op_;
@@ -178,13 +154,13 @@ TEST_P(Loop8Test6Param, OperationCheck) {
       ref_s[j] = s[j];
     }
 #if CONFIG_VP9_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     for (int j = 0; j < kNumCoeffs; ++j) {
@@ -250,13 +226,13 @@ TEST_P(Loop8Test6Param, ValueCheck) {
       ref_s[j] = s[j];
     }
 #if CONFIG_VP9_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int j = 0; j < kNumCoeffs; ++j) {
       err_count += ref_s[j] != s[j];
@@ -458,10 +434,10 @@ using std::tr1::make_tuple;
 INSTANTIATE_TEST_CASE_P(
     MMX, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_mmx>,
-                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_mmx>,
-                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_4_mmx,
+                   &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_mmx,
+                   &vpx_lpf_vertical_4_c, 8)));
 #endif  // HAVE_MMX
 
 #if HAVE_SSE2
@@ -469,70 +445,70 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_4_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_4_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_4_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_4_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_4_c>, 12, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_4_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_4_c>, 12, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_8_c>, 12, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_8_c>, 12, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_horizontal_edge_16_c>, 12, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_8_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_8_c>, 12, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_16_c>, 12, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_dual_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_16_dual_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_dual_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_16_dual_c>, 10, 1),
-        make_tuple(&wrapper_nc<vpx_highbd_lpf_vertical_16_dual_sse2>,
-                   &wrapper_nc<vpx_highbd_lpf_vertical_16_dual_c>, 12, 1)));
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,
+                   &vpx_highbd_lpf_vertical_16_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_8_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_16_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,
+                   &vpx_highbd_lpf_vertical_16_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_8_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_16_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,
+                   &vpx_highbd_lpf_vertical_16_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,
+                   &vpx_highbd_lpf_vertical_16_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,
+                   &vpx_highbd_lpf_vertical_16_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,
+                   &vpx_highbd_lpf_vertical_16_dual_c, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_sse2>,
-                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_sse2>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_sse2>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_sse2>,
-                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_sse2>,
-                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_sse2>,
-                   &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_8_sse2,
+                   &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
+                   &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_sse2,
+                   &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_sse2,
+                   &vpx_lpf_vertical_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_sse2,
+                   &vpx_lpf_vertical_16_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
@@ -540,10 +516,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     AVX2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_avx2>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_avx2>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_edge_8_avx2,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_avx2,
+                   &vpx_lpf_horizontal_edge_16_c, 8)));
 #endif
 
 #if HAVE_SSE2
@@ -600,23 +576,23 @@ INSTANTIATE_TEST_CASE_P(
 #if HAVE_NEON_ASM
 // Using #if inside the macro is unsupported on MSVS but the tests are not
 // currently built for MSVS with ARM and NEON.
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_neon>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_neon>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_neon>,
-                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_neon>,
-                   &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_edge_8_neon,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_neon,
+                   &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_neon,
+                   &vpx_lpf_vertical_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_neon,
+                   &vpx_lpf_vertical_16_dual_c, 8),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_neon>,
-                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_neon>,
-                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_neon>,
-                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_neon>,
-                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_8_neon,
+                   &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_neon,
+                   &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_4_neon,
+                   &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_neon,
+                   &vpx_lpf_vertical_4_c, 8)));
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(
@@ -637,22 +613,22 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     DSPR2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_dspr2>,
-                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>,
-                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_8>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_16>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_dspr2>,
-                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_dspr2>,
-                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dspr2>,
-                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_dual_dspr2>,
-                   &wrapper_nc<vpx_lpf_vertical_16_dual_c>, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_4_dspr2,
+                   &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dspr2,
+                   &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_8,
+                   &vpx_lpf_horizontal_edge_8, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16,
+                   &vpx_lpf_horizontal_edge_16, 8),
+        make_tuple(&vpx_lpf_vertical_4_dspr2,
+                   &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dspr2,
+                   &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dspr2,
+                   &vpx_lpf_vertical_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_dspr2,
+                   &vpx_lpf_vertical_16_dual_c, 8)));
 
 INSTANTIATE_TEST_CASE_P(
     DSPR2, Loop8Test9Param,
@@ -671,20 +647,20 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_msa>,
-                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>,
-                   &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_8_msa>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_horizontal_edge_16_msa>,
-                   &wrapper_nc<vpx_lpf_horizontal_edge_16_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_4_msa>,
-                   &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_8_msa>,
-                   &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),
-        make_tuple(&wrapper_nc<vpx_lpf_vertical_16_msa>,
-                   &wrapper_nc<vpx_lpf_vertical_16_c>, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_4_msa,
+                   &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_msa,
+                   &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_8_msa,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_msa,
+                   &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_msa,
+                   &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_msa,
+                   &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_msa,
+                   &vpx_lpf_vertical_16_c, 8)));
 
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test9Param,