NEON intrinsics for 4 loopfilter functions

New NEON intrinsics functions:
vpx_lpf_horizontal_edge_8_neon()
vpx_lpf_horizontal_edge_16_neon()
vpx_lpf_vertical_16_neon()
vpx_lpf_vertical_16_dual_neon()

BUG=webm:1262, webm:1263, webm:1264, webm:1265.

Change-Id: I7a2aff2a358b22277429329adec606e08efbc8cb
This commit is contained in:
Linfeng Zhang 2016-08-03 11:42:33 -07:00
parent f1e12c1bf3
commit f09b5a3328
8 changed files with 718 additions and 38 deletions

View File

@ -520,9 +520,6 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test6Param,
::testing::Values(
// Using #if inside the macro is unsupported on MSVS but the tests are not
// currently built for MSVS with ARM and NEON.
#if HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_edge_8_neon,
&vpx_lpf_horizontal_edge_8_c, 8),
make_tuple(&vpx_lpf_horizontal_edge_16_neon,
@ -530,13 +527,14 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vpx_lpf_vertical_16_neon, &vpx_lpf_vertical_16_c, 8),
make_tuple(&vpx_lpf_vertical_16_dual_neon, &vpx_lpf_vertical_16_dual_c,
8),
#endif // HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_8_neon, &vpx_lpf_horizontal_8_c, 8),
make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
::testing::Values(
// Using #if inside the macro is unsupported on MSVS but the tests are not
// currently built for MSVS with ARM and NEON.
#if HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_8_dual_neon,
&vpx_lpf_horizontal_8_dual_c, 8),

View File

@ -11,6 +11,7 @@
EXPORT |vpx_lpf_horizontal_edge_8_neon|
EXPORT |vpx_lpf_horizontal_edge_16_neon|
EXPORT |vpx_lpf_vertical_16_neon|
EXPORT |vpx_lpf_vertical_16_dual_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
@ -146,20 +147,21 @@ h_next
b mb_lpf_horizontal_edge
ENDP ; |vpx_lpf_horizontal_edge_16_neon|
; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh)
; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
; const uint8_t *limit, const uint8_t *thresh,
; int count) {
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
|vpx_lpf_vertical_16_neon| PROC
; r12 int count
|mb_lpf_vertical_edge_w| PROC
push {r4-r8, lr}
vpush {d8-d15}
ldr r4, [sp, #88] ; load thresh
v_count
vld1.8 {d16[]}, [r2] ; load *blimit
vld1.8 {d17[]}, [r3] ; load *limit
vld1.8 {d18[]}, [r4] ; load *thresh
@ -212,20 +214,21 @@ h_next
; flat && mask were not set for any of the channels. Just store the values
; from filter.
sub r8, r0, #2
sub r0, #2
vswp d23, d25
vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1
vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1
vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1
vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1
vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1
vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1
vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1
vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1
add r0, #2
b v_end
b v_next
v_mbfilter
tst r7, #2
@ -252,7 +255,7 @@ v_mbfilter
vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
b v_end
b v_next
v_wide_mbfilter
sub r8, r0, #8
@ -304,12 +307,40 @@ v_wide_mbfilter
vst1.8 {d19}, [r8@64], r1
vst1.8 {d15}, [r0@64], r1
v_end
v_next
subs r12, #1
bne v_count
vpop {d8-d15}
pop {r4-r8, pc}
ENDP ; |mb_lpf_vertical_edge_w|
; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
; const uint8_t *limit, const uint8_t *thresh)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh
|vpx_lpf_vertical_16_neon| PROC
mov r12, #1
b mb_lpf_vertical_edge_w
ENDP ; |vpx_lpf_vertical_16_neon|
; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh
|vpx_lpf_vertical_16_dual_neon| PROC
mov r12, #2
b mb_lpf_vertical_edge_w
ENDP ; |vpx_lpf_vertical_16_dual_neon|
; void vpx_wide_mbfilter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store.

View File

@ -0,0 +1,446 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/transpose_neon.h"
// Should we apply any filter at all: 11111111 yes, 00000000 no
// Computes the filter decision mask for one 8-lane column of the edge:
// returned lanes are 0xFF where any filtering should happen, 0x00 where the
// pixels are left untouched.  As side outputs:
//   *flat: flat_mask4() result -- inner pixels are within 1 of p0/q0, making
//          the lane a candidate for the 7-tap filter (caller still ANDs this
//          with the returned mask).
//   *hev:  high-edge-variance mask -- |p1-p0| or |q1-q0| exceeds *thresh.
static INLINE uint8x8_t filter_mask(
const uint8x8_t limit, const uint8x8_t blimit, const uint8x8_t thresh,
const uint8x8_t p3, const uint8x8_t p2, const uint8x8_t p1,
const uint8x8_t p0, const uint8x8_t q0, const uint8x8_t q1,
const uint8x8_t q2, const uint8x8_t q3, uint8x8_t *flat, uint8x8_t *hev) {
uint8x8_t t0, t1;
// max accumulates the largest neighbouring-pixel difference seen so far.
uint8x8_t max = vabd_u8(p1, p0);
max = vmax_u8(max, vabd_u8(q1, q0));
// Is there high edge variance internal edge: 11111111 yes, 00000000 no
*hev = vcgt_u8(max, thresh);
// Start the flat_mask4 accumulation from the same |p1-p0|,|q1-q0| maximum.
*flat = vmax_u8(max, vabd_u8(p2, p0));
max = vmax_u8(max, vabd_u8(p3, p2));
max = vmax_u8(max, vabd_u8(p2, p1));
max = vmax_u8(max, vabd_u8(q2, q1));
max = vmax_u8(max, vabd_u8(q3, q2));
// blimit test uses 2*|p0-q0| + |p1-q1|/2; the shift-left saturates so large
// differences cannot wrap around.
t0 = vabd_u8(p0, q0);
t1 = vabd_u8(p1, q1);
t0 = vqshl_n_u8(t0, 1);
t1 = vshr_n_u8(t1, 1);
t0 = vqadd_u8(t0, t1);
// Mask is set only where every step difference <= limit AND the combined
// edge difference <= blimit.
max = vcle_u8(max, limit);
t0 = vcle_u8(t0, blimit);
max = vand_u8(max, t0);
*flat = vmax_u8(*flat, vabd_u8(q2, q0));
*flat = vmax_u8(*flat, vabd_u8(p3, p0));
*flat = vmax_u8(*flat, vabd_u8(q3, q0));
*flat = vcle_u8(*flat, vdup_n_u8(1)); // flat_mask4()
return max;
}
// Wide-flatness test (flat_mask5): per lane, 0xFF when every one of the
// four pixels on each side differs from its edge pixel (p0 resp. q0) by at
// most 1, i.e. the lane is a candidate for the 15-tap wide filter.
static INLINE uint8x8_t flat_mask5(const uint8x8_t p4, const uint8x8_t p3,
                                   const uint8x8_t p2, const uint8x8_t p1,
                                   const uint8x8_t p0, const uint8x8_t q0,
                                   const uint8x8_t q1, const uint8x8_t q2,
                                   const uint8x8_t q3, const uint8x8_t q4) {
  // Reduce the eight absolute differences with a balanced max tree; max is
  // associative and commutative, so the result matches a linear reduction.
  const uint8x8_t dp43 = vmax_u8(vabd_u8(p4, p0), vabd_u8(p3, p0));
  const uint8x8_t dp21 = vmax_u8(vabd_u8(p2, p0), vabd_u8(p1, p0));
  const uint8x8_t dq12 = vmax_u8(vabd_u8(q1, q0), vabd_u8(q2, q0));
  const uint8x8_t dq34 = vmax_u8(vabd_u8(q3, q0), vabd_u8(q4, q0));
  const uint8x8_t worst = vmax_u8(vmax_u8(dp43, dp21), vmax_u8(dq12, dq34));
  return vcle_u8(worst, vdup_n_u8(1));
}
// Converts unsigned pixels [0,255] to signed [-128,127] by toggling the top
// bit, which is the representation the 4-tap filter arithmetic works in.
static INLINE int8x8_t flip_sign(const uint8x8_t v) {
  return vreinterpret_s8_u8(veor_u8(v, vdup_n_u8(0x80)));
}
// Inverse of flip_sign(): toggles the top bit again to return filtered
// signed values to the unsigned [0,255] pixel domain.
static INLINE uint8x8_t flip_sign_back(const int8x8_t v) {
  return vreinterpret_u8_s8(veor_s8(v, vdup_n_s8(0x80)));
}
// Advances the 7-tap sliding sum by one pixel (drop sub0/sub1, take in
// add0/add1) and emits the rounded average (sum >> 3) on lanes where flat is
// set; other lanes keep 'in'.  The u16 accumulator has ample headroom
// (total tap weight 8, so <= 8*255 plus two transient adds), so the order of
// the adds and subtracts does not affect the result.
static INLINE uint8x8_t filter_tap7(const uint8x8_t flat, const uint8x8_t sub0,
                                    const uint8x8_t sub1, const uint8x8_t add0,
                                    const uint8x8_t add1, const uint8x8_t in,
                                    uint16x8_t *sum) {
  uint16x8_t acc = *sum;
  acc = vaddw_u8(acc, add0);
  acc = vaddw_u8(acc, add1);
  acc = vsubw_u8(acc, sub0);
  acc = vsubw_u8(acc, sub1);
  *sum = acc;
  return vbsl_u8(flat, vrshrn_n_u16(acc, 3), in);
}
// Same sliding-window update as filter_tap7 but for the 15-tap wide filter:
// rounded average is sum >> 4 (total tap weight 16).  Headroom check:
// 16*255 + 2*255 = 4590 << 65535, so reordering add/sub is safe.
static INLINE uint8x8_t filter_tap15(const uint8x8_t flat, const uint8x8_t sub0,
                                     const uint8x8_t sub1, const uint8x8_t add0,
                                     const uint8x8_t add1, const uint8x8_t in,
                                     uint16x8_t *sum) {
  uint16x8_t acc = *sum;
  acc = vaddw_u8(acc, add0);
  acc = vaddw_u8(acc, add1);
  acc = vsubw_u8(acc, sub0);
  acc = vsubw_u8(acc, sub1);
  *sum = acc;
  return vbsl_u8(flat, vrshrn_n_u16(acc, 4), in);
}
// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
// Applies the 7-tap [1,1,1,2,1,1,1]/8 filter to the six inner pixels on
// lanes where 'flat' is set; other lanes pass the corresponding input
// (op*/oq* in/out values) through unchanged.  The running sum is seeded for
// op2 and then slid one pixel per output via filter_tap7 -- the add/sub
// arguments below are exactly the taps entering/leaving the window.
static INLINE void apply_7_tap_filter(const uint8x8_t flat, const uint8x8_t p3,
const uint8x8_t p2, const uint8x8_t p1,
const uint8x8_t p0, const uint8x8_t q0,
const uint8x8_t q1, const uint8x8_t q2,
const uint8x8_t q3, uint8x8_t *op2,
uint8x8_t *op1, uint8x8_t *op0,
uint8x8_t *oq0, uint8x8_t *oq1,
uint8x8_t *oq2) {
uint16x8_t sum;
// Seed: window centred on p2 (edge pixels replicate p3).
sum = vaddl_u8(p3, p3); // 2*p3
sum = vaddw_u8(sum, p3); // 3*p3
sum = vaddw_u8(sum, p2); // 3*p3+p2
sum = vaddw_u8(sum, p2); // 3*p3+2*p2
sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1
sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0
sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0
*op2 = vbsl_u8(flat, vrshrn_n_u16(sum, 3), p2);
// Each step: subtract the two taps leaving, add the two entering.
*op1 = filter_tap7(flat, p3, p2, p1, q1, *op1, &sum);
*op0 = filter_tap7(flat, p3, p1, p0, q2, *op0, &sum);
*oq0 = filter_tap7(flat, p3, p0, q0, q3, *oq0, &sum);
*oq1 = filter_tap7(flat, p2, q0, q1, q3, *oq1, &sum);
*oq2 = filter_tap7(flat, p1, q1, q2, q3, q2, &sum);
}
// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
// Applies the 15-tap [1,...,1,2,1,...,1]/16 wide filter to the fourteen
// inner pixels on lanes where 'flat2' is set; other lanes pass the fallback
// argument through.  For op2/op1/op0/oq0/oq1/oq2 the fallback is the current
// *op/*oq value (which the 7-tap stage may already have written), so the
// 15-tap result layers on top of the narrower filters per lane.
static INLINE void apply_15_tap_filter(
const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
uint16x8_t sum;
// Seed: window centred on p6 (edge pixels replicate p7).
sum = vshll_n_u8(p7, 3); // 8*p7
sum = vsubw_u8(sum, p7); // 7*p7
sum = vaddw_u8(sum, p6); // 7*p7+p6
sum = vaddw_u8(sum, p6); // 7*p7+2*p6
sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5
sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4
sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3
sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2
sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1
sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
*op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
// Each step: subtract the two taps leaving, add the two entering.
*op5 = filter_tap15(flat2, p7, p6, p5, q1, p5, &sum);
*op4 = filter_tap15(flat2, p7, p5, p4, q2, p4, &sum);
*op3 = filter_tap15(flat2, p7, p4, p3, q3, p3, &sum);
*op2 = filter_tap15(flat2, p7, p3, p2, q4, *op2, &sum);
*op1 = filter_tap15(flat2, p7, p2, p1, q5, *op1, &sum);
*op0 = filter_tap15(flat2, p7, p1, p0, q6, *op0, &sum);
*oq0 = filter_tap15(flat2, p7, p0, q0, q7, *oq0, &sum);
*oq1 = filter_tap15(flat2, p6, q0, q1, q7, *oq1, &sum);
*oq2 = filter_tap15(flat2, p5, q1, q2, q7, *oq2, &sum);
*oq3 = filter_tap15(flat2, p4, q2, q3, q7, q3, &sum);
*oq4 = filter_tap15(flat2, p3, q3, q4, q7, q4, &sum);
*oq5 = filter_tap15(flat2, p2, q4, q5, q7, q5, &sum);
*oq6 = filter_tap15(flat2, p1, q5, q6, q7, q6, &sum);
}
// Full 16-pixel loop-filter kernel for one 8-lane column.  Selects per lane
// between the 4-tap, 7-tap and 15-tap filters using the precomputed masks:
//   mask/hev  -- from filter_mask(); flat is already ANDed with mask by the
//                caller, and flat2 already ANDed with flat.
//   flat_u64/flat2_u64 -- scalar copies of the masks used for cheap
//                whole-vector early-outs (0 = no lane set, ~0 = all set).
// Outputs op6..oq6 are only written on the paths noted below; the caller
// stores them conditionally on the same flat/flat2 scalars.
static INLINE void filter16(
const uint8x8_t mask, const uint8x8_t flat, const uint64_t flat_u64,
const uint8x8_t flat2, const uint64_t flat2_u64, const uint8x8_t hev,
const uint8x8_t p7, const uint8x8_t p6, const uint8x8_t p5,
const uint8x8_t p4, const uint8x8_t p3, const uint8x8_t p2,
const uint8x8_t p1, const uint8x8_t p0, const uint8x8_t q0,
const uint8x8_t q1, const uint8x8_t q2, const uint8x8_t q3,
const uint8x8_t q4, const uint8x8_t q5, const uint8x8_t q6,
const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, uint8x8_t *op4,
uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, uint8x8_t *op0,
uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, uint8x8_t *oq3,
uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
// add outer taps if we have high edge variance
// Run the 4-tap filter unless every lane is flat (then the 7/15-tap
// filters overwrite all of its outputs anyway).
if (flat_u64 != (uint64_t)-1) {
int8x8_t filter, filter1, filter2, t;
// Work in the signed domain centred on zero.
int8x8_t ps1 = flip_sign(p1);
int8x8_t ps0 = flip_sign(p0);
int8x8_t qs0 = flip_sign(q0);
int8x8_t qs1 = flip_sign(q1);
filter = vqsub_s8(ps1, qs1);
filter = vand_s8(filter, vreinterpret_s8_u8(hev));
t = vqsub_s8(qs0, ps0);
// inner taps
// filter += 3 * (qs0 - ps0), with saturation at each step.
filter = vqadd_s8(filter, t);
filter = vqadd_s8(filter, t);
filter = vqadd_s8(filter, t);
filter = vand_s8(filter, vreinterpret_s8_u8(mask));
// save bottom 3 bits so that we round one side +4 and the other +3
// if it equals 4 we'll set to adjust by -1 to account for the fact
// we'd round 3 the other way
filter1 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(4)), 3);
filter2 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(3)), 3);
qs0 = vqsub_s8(qs0, filter1);
ps0 = vqadd_s8(ps0, filter2);
*oq0 = flip_sign_back(qs0);
*op0 = flip_sign_back(ps0);
// outer tap adjustments
// Only non-hev lanes get the outer (p1/q1) adjustment.
filter = vrshr_n_s8(filter1, 1);
filter = vbic_s8(filter, vreinterpret_s8_u8(hev));
qs1 = vqsub_s8(qs1, filter);
ps1 = vqadd_s8(ps1, filter);
*oq1 = flip_sign_back(qs1);
*op1 = flip_sign_back(ps1);
}
// At least one flat lane: run the wider filters.
if (flat_u64) {
// Defaults for lanes the 7-tap filter leaves alone (the 4-tap filter
// never writes op2/oq2).
*op2 = p2;
*oq2 = q2;
// Skip the 7-tap stage only when every lane takes the 15-tap path.
if (flat2_u64 != (uint64_t)-1) {
apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
oq0, oq1, oq2);
}
if (flat2_u64) {
apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3,
q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0,
oq0, oq1, oq2, oq3, oq4, oq5, oq6);
}
}
}
// Horizontal 16-pixel edge loop filter.  's' points at the first row below
// the edge; rows p7..p0 sit above it (s - 8*p .. s - 1*p) and q0..q7 below
// (s .. s + 7*p), where p is the row pitch in bytes.  Processes 'count'
// adjacent 8-pixel-wide columns, advancing s by 8 each iteration.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int count) {
// Thresholds are single bytes broadcast across all 8 lanes.
const uint8x8_t blimit_u8x8 = vld1_dup_u8(blimit);
const uint8x8_t limit_u8x8 = vld1_dup_u8(limit);
const uint8x8_t thresh_u8x8 = vld1_dup_u8(thresh);
do {
// Load 16 rows of 8 pixels straddling the edge.
const uint8x8_t p7 = vld1_u8(s - 8 * p);
const uint8x8_t p6 = vld1_u8(s - 7 * p);
const uint8x8_t p5 = vld1_u8(s - 6 * p);
const uint8x8_t p4 = vld1_u8(s - 5 * p);
const uint8x8_t p3 = vld1_u8(s - 4 * p);
const uint8x8_t p2 = vld1_u8(s - 3 * p);
const uint8x8_t p1 = vld1_u8(s - 2 * p);
const uint8x8_t p0 = vld1_u8(s - 1 * p);
const uint8x8_t q0 = vld1_u8(s + 0 * p);
const uint8x8_t q1 = vld1_u8(s + 1 * p);
const uint8x8_t q2 = vld1_u8(s + 2 * p);
const uint8x8_t q3 = vld1_u8(s + 3 * p);
const uint8x8_t q4 = vld1_u8(s + 4 * p);
const uint8x8_t q5 = vld1_u8(s + 5 * p);
const uint8x8_t q6 = vld1_u8(s + 6 * p);
const uint8x8_t q7 = vld1_u8(s + 7 * p);
uint8x8_t op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,
oq6, flat, hev;
const uint8x8_t mask = filter_mask(limit_u8x8, blimit_u8x8, thresh_u8x8, p3,
p2, p1, p0, q0, q1, q2, q3, &flat, &hev);
uint8x8_t flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
uint64_t flat_u64, flat2_u64;
// Combine masks hierarchically: flat implies mask, flat2 implies flat.
flat = vand_u8(flat, mask);
flat2 = vand_u8(flat2, flat);
// Scalar copies for the branchy store logic below.
flat_u64 = vget_lane_u64(vreinterpret_u64_u8(flat), 0);
flat2_u64 = vget_lane_u64(vreinterpret_u64_u8(flat2), 0);
filter16(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3,
p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3,
&op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6);
// Store only the rows a filter could have modified: the outer six pairs
// need flat2, op2/oq2 need flat, the innermost four are always stored.
if (flat_u64) {
if (flat2_u64) {
vst1_u8(s - 7 * p, op6);
vst1_u8(s - 6 * p, op5);
vst1_u8(s - 5 * p, op4);
vst1_u8(s - 4 * p, op3);
vst1_u8(s + 3 * p, oq3);
vst1_u8(s + 4 * p, oq4);
vst1_u8(s + 5 * p, oq5);
vst1_u8(s + 6 * p, oq6);
}
vst1_u8(s - 3 * p, op2);
vst1_u8(s + 2 * p, oq2);
}
vst1_u8(s - 2 * p, op1);
vst1_u8(s - 1 * p, op0);
vst1_u8(s + 0 * p, oq0);
vst1_u8(s + 1 * p, oq1);
// Next 8-pixel-wide column along the edge.
s += 8;
} while (--count);
}
// Public entry point: filters one 8-pixel-wide horizontal 16-pixel edge.
void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  // Single 8-wide column -> count = 1.
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, /*count=*/1);
}
// Public entry point: filters a 16-pixel-wide horizontal 16-pixel edge as
// two adjacent 8-wide columns sharing the same thresholds.
void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, /*count=*/2);
}
// Vertical 16-pixel edge loop filter.  's' points at the first pixel right
// of the edge; the function rewinds 8 bytes so each row load covers
// p7..p0 q0..q7.  Loads an 8x16 block, transposes it so each pN/qN column
// becomes a vector, filters, then stores back with the cheapest layout the
// mask outcome allows.  Processes 'count' 8-row bands.
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int count) {
const uint8x8_t blimit_u8x8 = vld1_dup_u8(blimit);
const uint8x8_t limit_u8x8 = vld1_dup_u8(limit);
const uint8x8_t thresh_u8x8 = vld1_dup_u8(thresh);
uint8_t *d;
// s walks the loads (advanced inside the loop); d walks the stores and
// always trails at the top-left of the current band.
s -= 8;
d = s;
do {
uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6,
flat, hev, mask, flat2;
uint64_t flat_u64, flat2_u64;
// Load 8 rows of 16 pixels straddling the edge.
t0 = vld1q_u8(s);
s += p;
t1 = vld1q_u8(s);
s += p;
t2 = vld1q_u8(s);
s += p;
t3 = vld1q_u8(s);
s += p;
t4 = vld1q_u8(s);
s += p;
t5 = vld1q_u8(s);
s += p;
t6 = vld1q_u8(s);
s += p;
t7 = vld1q_u8(s);
s += p;
// Rotate rows into columns so the horizontal kernel can be reused.
transpose_u8_16x8(t0, t1, t2, t3, t4, t5, t6, t7, &p7, &p6, &p5, &p4, &p3,
&p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
mask = filter_mask(limit_u8x8, blimit_u8x8, thresh_u8x8, p3, p2, p1, p0, q0,
q1, q2, q3, &flat, &hev);
flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
// Combine masks hierarchically: flat implies mask, flat2 implies flat.
flat = vand_u8(flat, mask);
flat2 = vand_u8(flat2, flat);
flat_u64 = vget_lane_u64(vreinterpret_u64_u8(flat), 0);
flat2_u64 = vget_lane_u64(vreinterpret_u64_u8(flat2), 0);
filter16(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3,
p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3,
&op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6);
if (flat_u64) {
if (flat2_u64) {
// Some lane took the wide filter: transpose back and store the whole
// 8x16 block (p7/q7 are never modified, so they round-trip).
uint8x16_t o0, o1, o2, o3, o4, o5, o6, o7;
transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
oq3, oq4, oq5, oq6, q7, &o0, &o1, &o2, &o3, &o4, &o5,
&o6, &o7);
vst1q_u8(d, o0);
d += p;
vst1q_u8(d, o1);
d += p;
vst1q_u8(d, o2);
d += p;
vst1q_u8(d, o3);
d += p;
vst1q_u8(d, o4);
d += p;
vst1q_u8(d, o5);
d += p;
vst1q_u8(d, o6);
d += p;
vst1q_u8(d, o7);
d += p;
} else {
// Only the 7-tap (and possibly 4-tap) filters ran: store just the six
// centre columns (op2..oq2) with per-row lane stores.  d+8 is the
// edge, so the p side goes at d-3 and the q side at d+0.
uint8x8x3_t o0, o1;
d += 8;
o0.val[0] = op2;
o0.val[1] = op1;
o0.val[2] = op0;
o1.val[0] = oq0;
o1.val[1] = oq1;
o1.val[2] = oq2;
vst3_lane_u8(d - 3, o0, 0);
vst3_lane_u8(d + 0, o1, 0);
d += p;
vst3_lane_u8(d - 3, o0, 1);
vst3_lane_u8(d + 0, o1, 1);
d += p;
vst3_lane_u8(d - 3, o0, 2);
vst3_lane_u8(d + 0, o1, 2);
d += p;
vst3_lane_u8(d - 3, o0, 3);
vst3_lane_u8(d + 0, o1, 3);
d += p;
vst3_lane_u8(d - 3, o0, 4);
vst3_lane_u8(d + 0, o1, 4);
d += p;
vst3_lane_u8(d - 3, o0, 5);
vst3_lane_u8(d + 0, o1, 5);
d += p;
vst3_lane_u8(d - 3, o0, 6);
vst3_lane_u8(d + 0, o1, 6);
d += p;
vst3_lane_u8(d - 3, o0, 7);
vst3_lane_u8(d + 0, o1, 7);
// Undo the +8 offset so d lands at the top-left of the next band.
d += p - 8;
}
} else {
// Only the 4-tap filter could have run: store the four centre columns
// (op1 op0 | oq0 oq1); d+6 is two pixels left of the edge.
uint8x8x4_t o;
d += 6;
o.val[0] = op1;
o.val[1] = op0;
o.val[2] = oq0;
o.val[3] = oq1;
vst4_lane_u8(d, o, 0);
d += p;
vst4_lane_u8(d, o, 1);
d += p;
vst4_lane_u8(d, o, 2);
d += p;
vst4_lane_u8(d, o, 3);
d += p;
vst4_lane_u8(d, o, 4);
d += p;
vst4_lane_u8(d, o, 5);
d += p;
vst4_lane_u8(d, o, 6);
d += p;
vst4_lane_u8(d, o, 7);
// Undo the +6 offset so d lands at the top-left of the next band.
d += p - 6;
}
} while (--count);
}
// Public entry point: filters one 8-row vertical 16-pixel edge.
void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
  // Single 8-row band -> count = 1.
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, /*count=*/1);
}
// Public entry point: filters a 16-row vertical 16-pixel edge as two
// stacked 8-row bands sharing the same thresholds.
void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, /*count=*/2);
}

View File

@ -38,11 +38,4 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
}
#endif // HAVE_NEON_ASM

View File

@ -101,4 +101,219 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
*a7 = d3.val[1];
}
// Transposes an 8x16 byte block (8 rows of 16 bytes in i0..i7) into 16
// 8-byte columns o0..o15, using the classic vtrn 8->16->32-bit butterfly.
// The interleaving at each stage is documented in the bit-matrix comments
// below; the final vget_low/high split separates columns 0-7 from 8-15.
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
// Input:
// i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
// i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
// i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
// i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
// i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
// i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
// i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
// i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
uint8x16x2_t b0, b1, b2, b3;
uint16x8x2_t c0, c1, c2, c3;
uint32x4x2_t d0, d1, d2, d3;
// Stage 1: pairwise byte interleave of adjacent rows.
// b0: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
// 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
// b1: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
// 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
// b2: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
// 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
// b3: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
// 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
b0 = vtrnq_u8(i0, i1);
b1 = vtrnq_u8(i2, i3);
b2 = vtrnq_u8(i4, i5);
b3 = vtrnq_u8(i6, i7);
// Stage 2: 16-bit interleave, gathering groups of four rows.
// c0: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
// 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
// c1: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
// 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
// c2: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
// 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
// c3: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
// 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
vreinterpretq_u16_u8(b1.val[0]));
c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
vreinterpretq_u16_u8(b1.val[1]));
c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
vreinterpretq_u16_u8(b3.val[0]));
c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
vreinterpretq_u16_u8(b3.val[1]));
// Stage 3: 32-bit interleave completes columns of all eight rows.
// d0: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
// 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
// d1: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
// 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
// d2: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
// 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
// d3: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
// 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c2.val[0]));
d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c2.val[1]));
d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
vreinterpretq_u32_u16(c3.val[0]));
d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
vreinterpretq_u32_u16(c3.val[1]));
// Output:
// o0 : 00 10 20 30 40 50 60 70
// o1 : 01 11 21 31 41 51 61 71
// o2 : 02 12 22 32 42 52 62 72
// o3 : 03 13 23 33 43 53 63 73
// o4 : 04 14 24 34 44 54 64 74
// o5 : 05 15 25 35 45 55 65 75
// o6 : 06 16 26 36 46 56 66 76
// o7 : 07 17 27 37 47 57 67 77
// o8 : 08 18 28 38 48 58 68 78
// o9 : 09 19 29 39 49 59 69 79
// o10: 0A 1A 2A 3A 4A 5A 6A 7A
// o11: 0B 1B 2B 3B 4B 5B 6B 7B
// o12: 0C 1C 2C 3C 4C 5C 6C 7C
// o13: 0D 1D 2D 3D 4D 5D 6D 7D
// o14: 0E 1E 2E 3E 4E 5E 6E 7E
// o15: 0F 1F 2F 3F 4F 5F 6F 7F
*o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
*o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
*o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
*o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
*o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
*o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
*o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
*o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
*o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
*o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
*o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
*o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
*o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
*o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
*o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
*o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
}
// Inverse layout of transpose_u8_16x8: packs 16 8-byte columns i0..i15 into
// 8 rows of 16 bytes o0..o7.  Columns are first paired (k, k+8) into q
// registers, then the same vtrn 8->16->32-bit butterfly as the forward
// transpose reorders them into rows.
static INLINE void transpose_u8_8x16(
const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
uint8x16_t *o7) {
// Input:
// i0 : 00 01 02 03 04 05 06 07
// i1 : 10 11 12 13 14 15 16 17
// i2 : 20 21 22 23 24 25 26 27
// i3 : 30 31 32 33 34 35 36 37
// i4 : 40 41 42 43 44 45 46 47
// i5 : 50 51 52 53 54 55 56 57
// i6 : 60 61 62 63 64 65 66 67
// i7 : 70 71 72 73 74 75 76 77
// i8 : 80 81 82 83 84 85 86 87
// i9 : 90 91 92 93 94 95 96 97
// i10: A0 A1 A2 A3 A4 A5 A6 A7
// i11: B0 B1 B2 B3 B4 B5 B6 B7
// i12: C0 C1 C2 C3 C4 C5 C6 C7
// i13: D0 D1 D2 D3 D4 D5 D6 D7
// i14: E0 E1 E2 E3 E4 E5 E6 E7
// i15: F0 F1 F2 F3 F4 F5 F6 F7
uint8x16x2_t b0, b1, b2, b3;
uint16x8x2_t c0, c1, c2, c3;
uint32x4x2_t d0, d1, d2, d3;
// Stage 0: combine column k with column k+8 into one q register.
// b0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
// 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
// b1: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
// 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
// b2: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
// 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
// b3: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
// 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
b0.val[0] = vcombine_u8(i0, i8);
b0.val[1] = vcombine_u8(i1, i9);
b1.val[0] = vcombine_u8(i2, i10);
b1.val[1] = vcombine_u8(i3, i11);
b2.val[0] = vcombine_u8(i4, i12);
b2.val[1] = vcombine_u8(i5, i13);
b3.val[0] = vcombine_u8(i6, i14);
b3.val[1] = vcombine_u8(i7, i15);
// Stage 1: pairwise byte interleave of adjacent columns.
// b0: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
// 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
// b1: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
// 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
// b2: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
// 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
// b3: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
// 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
b0 = vtrnq_u8(b0.val[0], b0.val[1]);
b1 = vtrnq_u8(b1.val[0], b1.val[1]);
b2 = vtrnq_u8(b2.val[0], b2.val[1]);
b3 = vtrnq_u8(b3.val[0], b3.val[1]);
// Stage 2: 16-bit interleave, gathering groups of four columns.
// c0: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
// 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
// c1: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
// 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
// c2: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
// 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
// c3: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
// 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
vreinterpretq_u16_u8(b1.val[0]));
c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
vreinterpretq_u16_u8(b1.val[1]));
c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
vreinterpretq_u16_u8(b3.val[0]));
c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
vreinterpretq_u16_u8(b3.val[1]));
// Stage 3: 32-bit interleave completes the 16-byte rows.
// d0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
// 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
// d1: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
// 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
// d2: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
// 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
// d3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
// 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c2.val[0]));
d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c2.val[1]));
d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
vreinterpretq_u32_u16(c3.val[0]));
d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
vreinterpretq_u32_u16(c3.val[1]));
// Output:
// o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
// o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
// o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
// o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
// o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
// o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
// o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
// o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
*o0 = vreinterpretq_u8_u32(d0.val[0]);
*o1 = vreinterpretq_u8_u32(d2.val[0]);
*o2 = vreinterpretq_u8_u32(d1.val[0]);
*o3 = vreinterpretq_u8_u32(d3.val[0]);
*o4 = vreinterpretq_u8_u32(d0.val[1]);
*o5 = vreinterpretq_u8_u32(d2.val[1]);
*o6 = vreinterpretq_u8_u32(d1.val[1]);
*o7 = vreinterpretq_u8_u32(d3.val[1]);
}
#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_

View File

@ -30,7 +30,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) {
}
#endif
// should we apply any filter at all: 11111111 yes, 00000000 no
// Should we apply any filter at all: 11111111 yes, 00000000 no
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
uint8_t q1, uint8_t q2, uint8_t q3) {
@ -68,7 +68,7 @@ static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
return ~mask;
}
// is there high edge variance internal edge: 11111111 yes, 00000000 no
// Is there high edge variance internal edge: 11111111 yes, 00000000 no
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
int8_t hev = 0;

View File

@ -144,6 +144,7 @@ DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/loopfilter_mb_neon.c
DSP_SRCS-yes += arm/loopfilter_16_neon.c
DSP_SRCS-yes += arm/loopfilter_8_neon.c
DSP_SRCS-yes += arm/loopfilter_4_neon.c

View File

@ -505,12 +505,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Loopfilter
#
add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon;
specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
@ -526,12 +524,10 @@ add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_
specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon;
specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon;
specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;