NEON intrinsics for 4 loopfilter functions

New NEON intrinsics functions:
vpx_lpf_horizontal_edge_8_neon()
vpx_lpf_horizontal_edge_16_neon()
vpx_lpf_vertical_16_neon()
vpx_lpf_vertical_16_dual_neon()

BUG=webm:1262, webm:1263, webm:1264, webm:1265.

Change-Id: I7a2aff2a358b22277429329adec606e08efbc8cb
This commit is contained in:
Linfeng Zhang 2016-08-03 11:42:33 -07:00
parent f1e12c1bf3
commit f09b5a3328
8 changed files with 718 additions and 38 deletions

View File

@ -520,9 +520,6 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test6Param,
::testing::Values(
// Using #if inside the macro is unsupported on MSVS but the tests are not
// currently built for MSVS with ARM and NEON.
#if HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_edge_8_neon,
&vpx_lpf_horizontal_edge_8_c, 8),
make_tuple(&vpx_lpf_horizontal_edge_16_neon,
@ -530,13 +527,14 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vpx_lpf_vertical_16_neon, &vpx_lpf_vertical_16_c, 8),
make_tuple(&vpx_lpf_vertical_16_dual_neon, &vpx_lpf_vertical_16_dual_c,
8),
#endif // HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_8_neon, &vpx_lpf_horizontal_8_c, 8),
make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
INSTANTIATE_TEST_CASE_P(NEON, Loop8Test9Param,
::testing::Values(
// Using #if inside the macro is unsupported on MSVS but the tests are not
// currently built for MSVS with ARM and NEON.
#if HAVE_NEON_ASM
make_tuple(&vpx_lpf_horizontal_8_dual_neon,
&vpx_lpf_horizontal_8_dual_c, 8),

View File

@ -11,6 +11,7 @@
EXPORT |vpx_lpf_horizontal_edge_8_neon|
EXPORT |vpx_lpf_horizontal_edge_16_neon|
EXPORT |vpx_lpf_vertical_16_neon|
EXPORT |vpx_lpf_vertical_16_dual_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
@ -146,20 +147,21 @@ h_next
b mb_lpf_horizontal_edge
ENDP ; |vpx_lpf_horizontal_edge_16_neon|
; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh)
; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
; const uint8_t *limit, const uint8_t *thresh,
; int count) {
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
|vpx_lpf_vertical_16_neon| PROC
; r12 int count
|mb_lpf_vertical_edge_w| PROC
push {r4-r8, lr}
vpush {d8-d15}
ldr r4, [sp, #88] ; load thresh
v_count
vld1.8 {d16[]}, [r2] ; load *blimit
vld1.8 {d17[]}, [r3] ; load *limit
vld1.8 {d18[]}, [r4] ; load *thresh
@ -212,20 +214,21 @@ h_next
; flat && mask were not set for any of the channels. Just store the values
; from filter.
sub r8, r0, #2
sub r0, #2
vswp d23, d25
vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1
vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1
vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1
vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1
vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1
vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1
vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1
vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1
add r0, #2
b v_end
b v_next
v_mbfilter
tst r7, #2
@ -252,7 +255,7 @@ v_mbfilter
vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
b v_end
b v_next
v_wide_mbfilter
sub r8, r0, #8
@ -304,12 +307,40 @@ v_wide_mbfilter
vst1.8 {d19}, [r8@64], r1
vst1.8 {d15}, [r0@64], r1
v_end
v_next
subs r12, #1
bne v_count
vpop {d8-d15}
pop {r4-r8, pc}
ENDP ; |mb_lpf_vertical_edge_w|
; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
; const uint8_t *limit, const uint8_t *thresh)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh
|vpx_lpf_vertical_16_neon| PROC
mov r12, #1
b mb_lpf_vertical_edge_w
ENDP ; |vpx_lpf_vertical_16_neon|
; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh
|vpx_lpf_vertical_16_dual_neon| PROC
mov r12, #2
b mb_lpf_vertical_edge_w
ENDP ; |vpx_lpf_vertical_16_dual_neon|
; void vpx_wide_mbfilter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store.

View File

@ -0,0 +1,446 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/transpose_neon.h"
// Should we apply any filter at all: 11111111 yes, 00000000 no
// Computes the filter decision mask for one 8-lane column of the edge:
// returned lanes are 0xFF where any filtering should happen, 0x00 where the
// pixels are left untouched.  As side outputs:
//   *flat: flat_mask4() result -- inner pixels are within 1 of p0/q0, making
//          the lane a candidate for the 7-tap filter (caller still ANDs this
//          with the returned mask).
//   *hev:  high-edge-variance mask -- |p1-p0| or |q1-q0| exceeds *thresh.
static INLINE uint8x8_t filter_mask(
const uint8x8_t limit, const uint8x8_t blimit, const uint8x8_t thresh,
const uint8x8_t p3, const uint8x8_t p2, const uint8x8_t p1,
const uint8x8_t p0, const uint8x8_t q0, const uint8x8_t q1,
const uint8x8_t q2, const uint8x8_t q3, uint8x8_t *flat, uint8x8_t *hev) {
uint8x8_t t0, t1;
// max accumulates the largest neighbouring-pixel difference seen so far.
uint8x8_t max = vabd_u8(p1, p0);
max = vmax_u8(max, vabd_u8(q1, q0));
// Is there high edge variance internal edge: 11111111 yes, 00000000 no
*hev = vcgt_u8(max, thresh);
// Start the flat_mask4 accumulation from the same |p1-p0|,|q1-q0| maximum.
*flat = vmax_u8(max, vabd_u8(p2, p0));
max = vmax_u8(max, vabd_u8(p3, p2));
max = vmax_u8(max, vabd_u8(p2, p1));
max = vmax_u8(max, vabd_u8(q2, q1));
max = vmax_u8(max, vabd_u8(q3, q2));
// blimit test uses 2*|p0-q0| + |p1-q1|/2; the shift-left saturates so large
// differences cannot wrap around.
t0 = vabd_u8(p0, q0);
t1 = vabd_u8(p1, q1);
t0 = vqshl_n_u8(t0, 1);
t1 = vshr_n_u8(t1, 1);
t0 = vqadd_u8(t0, t1);
// Mask is set only where every step difference <= limit AND the combined
// edge difference <= blimit.
max = vcle_u8(max, limit);
t0 = vcle_u8(t0, blimit);
max = vand_u8(max, t0);
*flat = vmax_u8(*flat, vabd_u8(q2, q0));
*flat = vmax_u8(*flat, vabd_u8(p3, p0));
*flat = vmax_u8(*flat, vabd_u8(q3, q0));
*flat = vcle_u8(*flat, vdup_n_u8(1)); // flat_mask4()
return max;
}
// Wide-flatness test (flat_mask5): per lane, 0xFF when every one of the
// four pixels on each side differs from its edge pixel (p0 resp. q0) by at
// most 1, i.e. the lane is a candidate for the 15-tap wide filter.
static INLINE uint8x8_t flat_mask5(const uint8x8_t p4, const uint8x8_t p3,
                                   const uint8x8_t p2, const uint8x8_t p1,
                                   const uint8x8_t p0, const uint8x8_t q0,
                                   const uint8x8_t q1, const uint8x8_t q2,
                                   const uint8x8_t q3, const uint8x8_t q4) {
  // Reduce the eight absolute differences with a balanced max tree; max is
  // associative and commutative, so the result matches a linear reduction.
  const uint8x8_t dp43 = vmax_u8(vabd_u8(p4, p0), vabd_u8(p3, p0));
  const uint8x8_t dp21 = vmax_u8(vabd_u8(p2, p0), vabd_u8(p1, p0));
  const uint8x8_t dq12 = vmax_u8(vabd_u8(q1, q0), vabd_u8(q2, q0));
  const uint8x8_t dq34 = vmax_u8(vabd_u8(q3, q0), vabd_u8(q4, q0));
  const uint8x8_t worst = vmax_u8(vmax_u8(dp43, dp21), vmax_u8(dq12, dq34));
  return vcle_u8(worst, vdup_n_u8(1));
}
// Converts unsigned pixels [0,255] to signed [-128,127] by toggling the top
// bit, which is the representation the 4-tap filter arithmetic works in.
static INLINE int8x8_t flip_sign(const uint8x8_t v) {
  return vreinterpret_s8_u8(veor_u8(v, vdup_n_u8(0x80)));
}
// Inverse of flip_sign(): toggles the top bit again to return filtered
// signed values to the unsigned [0,255] pixel domain.
static INLINE uint8x8_t flip_sign_back(const int8x8_t v) {
  return vreinterpret_u8_s8(veor_s8(v, vdup_n_s8(0x80)));
}
// Advances the 7-tap sliding sum by one pixel (drop sub0/sub1, take in
// add0/add1) and emits the rounded average (sum >> 3) on lanes where flat is
// set; other lanes keep 'in'.  The u16 accumulator has ample headroom
// (total tap weight 8, so <= 8*255 plus two transient adds), so the order of
// the adds and subtracts does not affect the result.
static INLINE uint8x8_t filter_tap7(const uint8x8_t flat, const uint8x8_t sub0,
                                    const uint8x8_t sub1, const uint8x8_t add0,
                                    const uint8x8_t add1, const uint8x8_t in,
                                    uint16x8_t *sum) {
  uint16x8_t acc = *sum;
  acc = vaddw_u8(acc, add0);
  acc = vaddw_u8(acc, add1);
  acc = vsubw_u8(acc, sub0);
  acc = vsubw_u8(acc, sub1);
  *sum = acc;
  return vbsl_u8(flat, vrshrn_n_u16(acc, 3), in);
}
// Same sliding-window update as filter_tap7 but for the 15-tap wide filter:
// rounded average is sum >> 4 (total tap weight 16).  Headroom check:
// 16*255 + 2*255 = 4590 << 65535, so reordering add/sub is safe.
static INLINE uint8x8_t filter_tap15(const uint8x8_t flat, const uint8x8_t sub0,
                                     const uint8x8_t sub1, const uint8x8_t add0,
                                     const uint8x8_t add1, const uint8x8_t in,
                                     uint16x8_t *sum) {
  uint16x8_t acc = *sum;
  acc = vaddw_u8(acc, add0);
  acc = vaddw_u8(acc, add1);
  acc = vsubw_u8(acc, sub0);
  acc = vsubw_u8(acc, sub1);
  *sum = acc;
  return vbsl_u8(flat, vrshrn_n_u16(acc, 4), in);
}
// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
// Applies the 7-tap [1,1,1,2,1,1,1]/8 filter to the six inner pixels on
// lanes where 'flat' is set; other lanes pass the corresponding input
// (op*/oq* in/out values) through unchanged.  The running sum is seeded for
// op2 and then slid one pixel per output via filter_tap7 -- the add/sub
// arguments below are exactly the taps entering/leaving the window.
static INLINE void apply_7_tap_filter(const uint8x8_t flat, const uint8x8_t p3,
const uint8x8_t p2, const uint8x8_t p1,
const uint8x8_t p0, const uint8x8_t q0,
const uint8x8_t q1, const uint8x8_t q2,
const uint8x8_t q3, uint8x8_t *op2,
uint8x8_t *op1, uint8x8_t *op0,
uint8x8_t *oq0, uint8x8_t *oq1,
uint8x8_t *oq2) {
uint16x8_t sum;
// Seed: window centred on p2 (edge pixels replicate p3).
sum = vaddl_u8(p3, p3); // 2*p3
sum = vaddw_u8(sum, p3); // 3*p3
sum = vaddw_u8(sum, p2); // 3*p3+p2
sum = vaddw_u8(sum, p2); // 3*p3+2*p2
sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1
sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0
sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0
*op2 = vbsl_u8(flat, vrshrn_n_u16(sum, 3), p2);
// Each step: subtract the two taps leaving, add the two entering.
*op1 = filter_tap7(flat, p3, p2, p1, q1, *op1, &sum);
*op0 = filter_tap7(flat, p3, p1, p0, q2, *op0, &sum);
*oq0 = filter_tap7(flat, p3, p0, q0, q3, *oq0, &sum);
*oq1 = filter_tap7(flat, p2, q0, q1, q3, *oq1, &sum);
*oq2 = filter_tap7(flat, p1, q1, q2, q3, q2, &sum);
}
// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
// Applies the 15-tap [1,...,1,2,1,...,1]/16 wide filter to the fourteen
// inner pixels on lanes where 'flat2' is set; other lanes pass the fallback
// argument through.  For op2/op1/op0/oq0/oq1/oq2 the fallback is the current
// *op/*oq value (which the 7-tap stage may already have written), so the
// 15-tap result layers on top of the narrower filters per lane.
static INLINE void apply_15_tap_filter(
const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
uint16x8_t sum;
// Seed: window centred on p6 (edge pixels replicate p7).
sum = vshll_n_u8(p7, 3); // 8*p7
sum = vsubw_u8(sum, p7); // 7*p7
sum = vaddw_u8(sum, p6); // 7*p7+p6
sum = vaddw_u8(sum, p6); // 7*p7+2*p6
sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5
sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4
sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3
sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2
sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1
sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
*op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
// Each step: subtract the two taps leaving, add the two entering.
*op5 = filter_tap15(flat2, p7, p6, p5, q1, p5, &sum);
*op4 = filter_tap15(flat2, p7, p5, p4, q2, p4, &sum);
*op3 = filter_tap15(flat2, p7, p4, p3, q3, p3, &sum);
*op2 = filter_tap15(flat2, p7, p3, p2, q4, *op2, &sum);
*op1 = filter_tap15(flat2, p7, p2, p1, q5, *op1, &sum);
*op0 = filter_tap15(flat2, p7, p1, p0, q6, *op0, &sum);
*oq0 = filter_tap15(flat2, p7, p0, q0, q7, *oq0, &sum);
*oq1 = filter_tap15(flat2, p6, q0, q1, q7, *oq1, &sum);
*oq2 = filter_tap15(flat2, p5, q1, q2, q7, *oq2, &sum);
*oq3 = filter_tap15(flat2, p4, q2, q3, q7, q3, &sum);
*oq4 = filter_tap15(flat2, p3, q3, q4, q7, q4, &sum);
*oq5 = filter_tap15(flat2, p2, q4, q5, q7, q5, &sum);
*oq6 = filter_tap15(flat2, p1, q5, q6, q7, q6, &sum);
}
// Full 16-pixel loop-filter kernel for one 8-lane column.  Selects per lane
// between the 4-tap, 7-tap and 15-tap filters using the precomputed masks:
//   mask/hev  -- from filter_mask(); flat is already ANDed with mask by the
//                caller, and flat2 already ANDed with flat.
//   flat_u64/flat2_u64 -- scalar copies of the masks used for cheap
//                whole-vector early-outs (0 = no lane set, ~0 = all set).
// Outputs op6..oq6 are only written on the paths noted below; the caller
// stores them conditionally on the same flat/flat2 scalars.
static INLINE void filter16(
const uint8x8_t mask, const uint8x8_t flat, const uint64_t flat_u64,
const uint8x8_t flat2, const uint64_t flat2_u64, const uint8x8_t hev,
const uint8x8_t p7, const uint8x8_t p6, const uint8x8_t p5,
const uint8x8_t p4, const uint8x8_t p3, const uint8x8_t p2,
const uint8x8_t p1, const uint8x8_t p0, const uint8x8_t q0,
const uint8x8_t q1, const uint8x8_t q2, const uint8x8_t q3,
const uint8x8_t q4, const uint8x8_t q5, const uint8x8_t q6,
const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, uint8x8_t *op4,
uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, uint8x8_t *op0,
uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, uint8x8_t *oq3,
uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
// add outer taps if we have high edge variance
// Run the 4-tap filter unless every lane is flat (then the 7/15-tap
// filters overwrite all of its outputs anyway).
if (flat_u64 != (uint64_t)-1) {
int8x8_t filter, filter1, filter2, t;
// Work in the signed domain centred on zero.
int8x8_t ps1 = flip_sign(p1);
int8x8_t ps0 = flip_sign(p0);
int8x8_t qs0 = flip_sign(q0);
int8x8_t qs1 = flip_sign(q1);
filter = vqsub_s8(ps1, qs1);
filter = vand_s8(filter, vreinterpret_s8_u8(hev));
t = vqsub_s8(qs0, ps0);
// inner taps
// filter += 3 * (qs0 - ps0), with saturation at each step.
filter = vqadd_s8(filter, t);
filter = vqadd_s8(filter, t);
filter = vqadd_s8(filter, t);
filter = vand_s8(filter, vreinterpret_s8_u8(mask));
// save bottom 3 bits so that we round one side +4 and the other +3
// if it equals 4 we'll set to adjust by -1 to account for the fact
// we'd round 3 the other way
filter1 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(4)), 3);
filter2 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(3)), 3);
qs0 = vqsub_s8(qs0, filter1);
ps0 = vqadd_s8(ps0, filter2);
*oq0 = flip_sign_back(qs0);
*op0 = flip_sign_back(ps0);
// outer tap adjustments
// Only non-hev lanes get the outer (p1/q1) adjustment.
filter = vrshr_n_s8(filter1, 1);
filter = vbic_s8(filter, vreinterpret_s8_u8(hev));
qs1 = vqsub_s8(qs1, filter);
ps1 = vqadd_s8(ps1, filter);
*oq1 = flip_sign_back(qs1);
*op1 = flip_sign_back(ps1);
}
// At least one flat lane: run the wider filters.
if (flat_u64) {
// Defaults for lanes the 7-tap filter leaves alone (the 4-tap filter
// never writes op2/oq2).
*op2 = p2;
*oq2 = q2;
// Skip the 7-tap stage only when every lane takes the 15-tap path.
if (flat2_u64 != (uint64_t)-1) {
apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
oq0, oq1, oq2);
}
if (flat2_u64) {
apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3,
q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0,
oq0, oq1, oq2, oq3, oq4, oq5, oq6);
}
}
}
// Horizontal 16-pixel edge loop filter.  's' points at the first row below
// the edge; rows p7..p0 sit above it (s - 8*p .. s - 1*p) and q0..q7 below
// (s .. s + 7*p), where p is the row pitch in bytes.  Processes 'count'
// adjacent 8-pixel-wide columns, advancing s by 8 each iteration.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int count) {
// Thresholds are single bytes broadcast across all 8 lanes.
const uint8x8_t blimit_u8x8 = vld1_dup_u8(blimit);
const uint8x8_t limit_u8x8 = vld1_dup_u8(limit);
const uint8x8_t thresh_u8x8 = vld1_dup_u8(thresh);
do {
// Load 16 rows of 8 pixels straddling the edge.
const uint8x8_t p7 = vld1_u8(s - 8 * p);
const uint8x8_t p6 = vld1_u8(s - 7 * p);
const uint8x8_t p5 = vld1_u8(s - 6 * p);
const uint8x8_t p4 = vld1_u8(s - 5 * p);
const uint8x8_t p3 = vld1_u8(s - 4 * p);
const uint8x8_t p2 = vld1_u8(s - 3 * p);
const uint8x8_t p1 = vld1_u8(s - 2 * p);
const uint8x8_t p0 = vld1_u8(s - 1 * p);
const uint8x8_t q0 = vld1_u8(s + 0 * p);
const uint8x8_t q1 = vld1_u8(s + 1 * p);
const uint8x8_t q2 = vld1_u8(s + 2 * p);
const uint8x8_t q3 = vld1_u8(s + 3 * p);
const uint8x8_t q4 = vld1_u8(s + 4 * p);
const uint8x8_t q5 = vld1_u8(s + 5 * p);
const uint8x8_t q6 = vld1_u8(s + 6 * p);
const uint8x8_t q7 = vld1_u8(s + 7 * p);
uint8x8_t op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,
oq6, flat, hev;
const uint8x8_t mask = filter_mask(limit_u8x8, blimit_u8x8, thresh_u8x8, p3,
p2, p1, p0, q0, q1, q2, q3, &flat, &hev);
uint8x8_t flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
uint64_t flat_u64, flat2_u64;
// Combine masks hierarchically: flat implies mask, flat2 implies flat.
flat = vand_u8(flat, mask);
flat2 = vand_u8(flat2, flat);
// Scalar copies for the branchy store logic below.
flat_u64 = vget_lane_u64(vreinterpret_u64_u8(flat), 0);
flat2_u64 = vget_lane_u64(vreinterpret_u64_u8(flat2), 0);
filter16(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3,
p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3,
&op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6);
// Store only the rows a filter could have modified: the outer six pairs
// need flat2, op2/oq2 need flat, the innermost four are always stored.
if (flat_u64) {
if (flat2_u64) {
vst1_u8(s - 7 * p, op6);
vst1_u8(s - 6 * p, op5);
vst1_u8(s - 5 * p, op4);
vst1_u8(s - 4 * p, op3);
vst1_u8(s + 3 * p, oq3);
vst1_u8(s + 4 * p, oq4);
vst1_u8(s + 5 * p, oq5);
vst1_u8(s + 6 * p, oq6);
}
vst1_u8(s - 3 * p, op2);
vst1_u8(s + 2 * p, oq2);
}
vst1_u8(s - 2 * p, op1);
vst1_u8(s - 1 * p, op0);
vst1_u8(s + 0 * p, oq0);
vst1_u8(s + 1 * p, oq1);
// Next 8-pixel-wide column along the edge.
s += 8;
} while (--count);
}
// Public entry point: filters one 8-pixel-wide horizontal 16-pixel edge.
void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  // Single 8-wide column -> count = 1.
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, /*count=*/1);
}
// Public entry point: filters a 16-pixel-wide horizontal 16-pixel edge as
// two adjacent 8-wide columns sharing the same thresholds.
void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, /*count=*/2);
}
// Vertical 16-pixel edge loop filter.  's' points at the first pixel right
// of the edge; the function rewinds 8 bytes so each row load covers
// p7..p0 q0..q7.  Loads an 8x16 block, transposes it so each pN/qN column
// becomes a vector, filters, then stores back with the cheapest layout the
// mask outcome allows.  Processes 'count' 8-row bands.
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int count) {
const uint8x8_t blimit_u8x8 = vld1_dup_u8(blimit);
const uint8x8_t limit_u8x8 = vld1_dup_u8(limit);
const uint8x8_t thresh_u8x8 = vld1_dup_u8(thresh);
uint8_t *d;
// s walks the loads (advanced inside the loop); d walks the stores and
// always trails at the top-left of the current band.
s -= 8;
d = s;
do {
uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6,
flat, hev, mask, flat2;
uint64_t flat_u64, flat2_u64;
// Load 8 rows of 16 pixels straddling the edge.
t0 = vld1q_u8(s);
s += p;
t1 = vld1q_u8(s);
s += p;
t2 = vld1q_u8(s);
s += p;
t3 = vld1q_u8(s);
s += p;
t4 = vld1q_u8(s);
s += p;
t5 = vld1q_u8(s);
s += p;
t6 = vld1q_u8(s);
s += p;
t7 = vld1q_u8(s);
s += p;
// Rotate rows into columns so the horizontal kernel can be reused.
transpose_u8_16x8(t0, t1, t2, t3, t4, t5, t6, t7, &p7, &p6, &p5, &p4, &p3,
&p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
mask = filter_mask(limit_u8x8, blimit_u8x8, thresh_u8x8, p3, p2, p1, p0, q0,
q1, q2, q3, &flat, &hev);
flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
// Combine masks hierarchically: flat implies mask, flat2 implies flat.
flat = vand_u8(flat, mask);
flat2 = vand_u8(flat2, flat);
flat_u64 = vget_lane_u64(vreinterpret_u64_u8(flat), 0);
flat2_u64 = vget_lane_u64(vreinterpret_u64_u8(flat2), 0);
filter16(mask, flat, flat_u64, flat2, flat2_u64, hev, p7, p6, p5, p4, p3,
p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3,
&op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6);
if (flat_u64) {
if (flat2_u64) {
// Some lane took the wide filter: transpose back and store the whole
// 8x16 block (p7/q7 are never modified, so they round-trip).
uint8x16_t o0, o1, o2, o3, o4, o5, o6, o7;
transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
oq3, oq4, oq5, oq6, q7, &o0, &o1, &o2, &o3, &o4, &o5,
&o6, &o7);
vst1q_u8(d, o0);
d += p;
vst1q_u8(d, o1);
d += p;
vst1q_u8(d, o2);
d += p;
vst1q_u8(d, o3);
d += p;
vst1q_u8(d, o4);
d += p;
vst1q_u8(d, o5);
d += p;
vst1q_u8(d, o6);
d += p;
vst1q_u8(d, o7);
d += p;
} else {
// Only the 7-tap (and possibly 4-tap) filters ran: store just the six
// centre columns (op2..oq2) with per-row lane stores.  d+8 is the
// edge, so the p side goes at d-3 and the q side at d+0.
uint8x8x3_t o0, o1;
d += 8;
o0.val[0] = op2;
o0.val[1] = op1;
o0.val[2] = op0;
o1.val[0] = oq0;
o1.val[1] = oq1;
o1.val[2] = oq2;
vst3_lane_u8(d - 3, o0, 0);
vst3_lane_u8(d + 0, o1, 0);
d += p;
vst3_lane_u8(d - 3, o0, 1);
vst3_lane_u8(d + 0, o1, 1);
d += p;
vst3_lane_u8(d - 3, o0, 2);
vst3_lane_u8(d + 0, o1, 2);
d += p;
vst3_lane_u8(d - 3, o0, 3);
vst3_lane_u8(d + 0, o1, 3);
d += p;
vst3_lane_u8(d - 3, o0, 4);
vst3_lane_u8(d + 0, o1, 4);
d += p;
vst3_lane_u8(d - 3, o0, 5);
vst3_lane_u8(d + 0, o1, 5);
d += p;
vst3_lane_u8(d - 3, o0, 6);
vst3_lane_u8(d + 0, o1, 6);
d += p;
vst3_lane_u8(d - 3, o0, 7);
vst3_lane_u8(d + 0, o1, 7);
// Undo the +8 offset so d lands at the top-left of the next band.
d += p - 8;
}
} else {
// Only the 4-tap filter could have run: store the four centre columns
// (op1 op0 | oq0 oq1); d+6 is two pixels left of the edge.
uint8x8x4_t o;
d += 6;
o.val[0] = op1;
o.val[1] = op0;
o.val[2] = oq0;
o.val[3] = oq1;
vst4_lane_u8(d, o, 0);
d += p;
vst4_lane_u8(d, o, 1);
d += p;
vst4_lane_u8(d, o, 2);
d += p;
vst4_lane_u8(d, o, 3);
d += p;
vst4_lane_u8(d, o, 4);
d += p;
vst4_lane_u8(d, o, 5);
d += p;
vst4_lane_u8(d, o, 6);
d += p;
vst4_lane_u8(d, o, 7);
// Undo the +6 offset so d lands at the top-left of the next band.
d += p - 6;
}
} while (--count);
}
// Public entry point: filters one 8-row vertical 16-pixel edge.
void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
  // Single 8-row band -> count = 1.
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, /*count=*/1);
}
// Public entry point: filters a 16-row vertical 16-pixel edge as two
// stacked 8-row bands sharing the same thresholds.
void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, /*count=*/2);
}

View File

@ -38,11 +38,4 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
}
#endif // HAVE_NEON_ASM

View File

@ -101,4 +101,219 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
*a7 = d3.val[1];
}
// Transposes an 8x16 byte block (8 rows of 16 bytes in i0..i7) into 16
// 8-byte columns o0..o15, using the classic vtrn 8->16->32-bit butterfly.
// The interleaving at each stage is documented in the bit-matrix comments
// below; the final vget_low/high split separates columns 0-7 from 8-15.
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
// Input:
// i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
// i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
// i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
// i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
// i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
// i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
// i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
// i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
uint8x16x2_t b0, b1, b2, b3;
uint16x8x2_t c0, c1, c2, c3;
uint32x4x2_t d0, d1, d2, d3;
// Stage 1: pairwise byte interleave of adjacent rows.
// b0: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
// 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
// b1: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
// 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
// b2: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
// 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
// b3: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
// 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
b0 = vtrnq_u8(i0, i1);
b1 = vtrnq_u8(i2, i3);
b2 = vtrnq_u8(i4, i5);
b3 = vtrnq_u8(i6, i7);
// Stage 2: 16-bit interleave, gathering groups of four rows.
// c0: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
// 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
// c1: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
// 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
// c2: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
// 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
// c3: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
// 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
vreinterpretq_u16_u8(b1.val[0]));
c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
vreinterpretq_u16_u8(b1.val[1]));
c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
vreinterpretq_u16_u8(b3.val[0]));
c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
vreinterpretq_u16_u8(b3.val[1]));
// Stage 3: 32-bit interleave completes columns of all eight rows.
// d0: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
// 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
// d1: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
// 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
// d2: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
// 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
// d3: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
// 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c2.val[0]));
d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c2.val[1]));
d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
vreinterpretq_u32_u16(c3.val[0]));
d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
vreinterpretq_u32_u16(c3.val[1]));
// Output:
// o0 : 00 10 20 30 40 50 60 70
// o1 : 01 11 21 31 41 51 61 71
// o2 : 02 12 22 32 42 52 62 72
// o3 : 03 13 23 33 43 53 63 73
// o4 : 04 14 24 34 44 54 64 74
// o5 : 05 15 25 35 45 55 65 75
// o6 : 06 16 26 36 46 56 66 76
// o7 : 07 17 27 37 47 57 67 77
// o8 : 08 18 28 38 48 58 68 78
// o9 : 09 19 29 39 49 59 69 79
// o10: 0A 1A 2A 3A 4A 5A 6A 7A
// o11: 0B 1B 2B 3B 4B 5B 6B 7B
// o12: 0C 1C 2C 3C 4C 5C 6C 7C
// o13: 0D 1D 2D 3D 4D 5D 6D 7D
// o14: 0E 1E 2E 3E 4E 5E 6E 7E
// o15: 0F 1F 2F 3F 4F 5F 6F 7F
*o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
*o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
*o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
*o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
*o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
*o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
*o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
*o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
*o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
*o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
*o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
*o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
*o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
*o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
*o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
*o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
}
// Inverse layout of transpose_u8_16x8: packs 16 8-byte columns i0..i15 into
// 8 rows of 16 bytes o0..o7.  Columns are first paired (k, k+8) into q
// registers, then the same vtrn 8->16->32-bit butterfly as the forward
// transpose reorders them into rows.
static INLINE void transpose_u8_8x16(
const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
uint8x16_t *o7) {
// Input:
// i0 : 00 01 02 03 04 05 06 07
// i1 : 10 11 12 13 14 15 16 17
// i2 : 20 21 22 23 24 25 26 27
// i3 : 30 31 32 33 34 35 36 37
// i4 : 40 41 42 43 44 45 46 47
// i5 : 50 51 52 53 54 55 56 57
// i6 : 60 61 62 63 64 65 66 67
// i7 : 70 71 72 73 74 75 76 77
// i8 : 80 81 82 83 84 85 86 87
// i9 : 90 91 92 93 94 95 96 97
// i10: A0 A1 A2 A3 A4 A5 A6 A7
// i11: B0 B1 B2 B3 B4 B5 B6 B7
// i12: C0 C1 C2 C3 C4 C5 C6 C7
// i13: D0 D1 D2 D3 D4 D5 D6 D7
// i14: E0 E1 E2 E3 E4 E5 E6 E7
// i15: F0 F1 F2 F3 F4 F5 F6 F7
uint8x16x2_t b0, b1, b2, b3;
uint16x8x2_t c0, c1, c2, c3;
uint32x4x2_t d0, d1, d2, d3;
// Stage 0: combine column k with column k+8 into one q register.
// b0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
// 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
// b1: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
// 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
// b2: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
// 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
// b3: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
// 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
b0.val[0] = vcombine_u8(i0, i8);
b0.val[1] = vcombine_u8(i1, i9);
b1.val[0] = vcombine_u8(i2, i10);
b1.val[1] = vcombine_u8(i3, i11);
b2.val[0] = vcombine_u8(i4, i12);
b2.val[1] = vcombine_u8(i5, i13);
b3.val[0] = vcombine_u8(i6, i14);
b3.val[1] = vcombine_u8(i7, i15);
// Stage 1: pairwise byte interleave of adjacent columns.
// b0: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
// 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
// b1: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
// 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
// b2: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
// 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
// b3: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
// 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
b0 = vtrnq_u8(b0.val[0], b0.val[1]);
b1 = vtrnq_u8(b1.val[0], b1.val[1]);
b2 = vtrnq_u8(b2.val[0], b2.val[1]);
b3 = vtrnq_u8(b3.val[0], b3.val[1]);
// Stage 2: 16-bit interleave, gathering groups of four columns.
// c0: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
// 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
// c1: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
// 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
// c2: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
// 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
// c3: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
// 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
vreinterpretq_u16_u8(b1.val[0]));
c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
vreinterpretq_u16_u8(b1.val[1]));
c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
vreinterpretq_u16_u8(b3.val[0]));
c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
vreinterpretq_u16_u8(b3.val[1]));
// Stage 3: 32-bit interleave completes the 16-byte rows.
// d0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
// 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
// d1: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
// 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
// d2: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
// 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
// d3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
// 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c2.val[0]));
d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c2.val[1]));
d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
vreinterpretq_u32_u16(c3.val[0]));
d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
vreinterpretq_u32_u16(c3.val[1]));
// Output:
// o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
// o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
// o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
// o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
// o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
// o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
// o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
// o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
*o0 = vreinterpretq_u8_u32(d0.val[0]);
*o1 = vreinterpretq_u8_u32(d2.val[0]);
*o2 = vreinterpretq_u8_u32(d1.val[0]);
*o3 = vreinterpretq_u8_u32(d3.val[0]);
*o4 = vreinterpretq_u8_u32(d0.val[1]);
*o5 = vreinterpretq_u8_u32(d2.val[1]);
*o6 = vreinterpretq_u8_u32(d1.val[1]);
*o7 = vreinterpretq_u8_u32(d3.val[1]);
}
#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_

View File

@ -30,7 +30,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) {
}
#endif
// should we apply any filter at all: 11111111 yes, 00000000 no
// Should we apply any filter at all: 11111111 yes, 00000000 no
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
uint8_t q1, uint8_t q2, uint8_t q3) {
@ -68,7 +68,7 @@ static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
return ~mask;
}
// is there high edge variance internal edge: 11111111 yes, 00000000 no
// Is there high edge variance internal edge: 11111111 yes, 00000000 no
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
uint8_t q0, uint8_t q1) {
int8_t hev = 0;

View File

@ -144,6 +144,7 @@ DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/loopfilter_mb_neon.c
DSP_SRCS-yes += arm/loopfilter_16_neon.c
DSP_SRCS-yes += arm/loopfilter_8_neon.c
DSP_SRCS-yes += arm/loopfilter_4_neon.c

View File

@ -505,12 +505,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Loopfilter
#
add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon;
specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
@ -526,12 +524,10 @@ add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_
specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon;
specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon;
specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;