Refactor lpf (size 4 and 8) NEON intrinsics optimization

Also check in the 8x8 8-bit transpose NEON intrinsics optimization,
transpose_u8_8x8().

Change-Id: I32d321cf97ea21eab158ac4896990fc9a51681c4
This commit is contained in:
Linfeng Zhang 2016-09-19 11:37:16 -07:00
parent aa0eb67bf7
commit 761e5ec2f6
6 changed files with 438 additions and 995 deletions

View File

@ -1,173 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
// Applies the 4-tap loop filter to 16 pixel positions in parallel.
//
// qblimit, qlimit and qthresh are the per-lane thresholds; q3..q10 are the
// eight pixel vectors p3..q3 across the edge. The filtered p1, p0, q0 and q1
// are written through q5r, q6r, q7r and q8r. p3, p2, q2 and q3 only feed the
// filter mask and are never output.
static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit
uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh
uint8x16_t q3, // p3
uint8x16_t q4, // p2
uint8x16_t q5, // p1
uint8x16_t q6, // p0
uint8x16_t q7, // q0
uint8x16_t q8, // q1
uint8x16_t q9, // q2
uint8x16_t q10, // q3
uint8x16_t *q5r, // p1
uint8x16_t *q6r, // p0
uint8x16_t *q7r, // q0
uint8x16_t *q8r) { // q1
uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int16x8_t q2s16, q11s16;
uint16x8_t q4u16;
int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
int8x8_t d2s8, d3s8;
// Absolute differences between neighbouring pixels on each side of the edge:
// |p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|.
// q3 and q4 (p3, p2) are reused as scratch once their differences are taken.
q11u8 = vabdq_u8(q3, q4);
q12u8 = vabdq_u8(q4, q5);
q13u8 = vabdq_u8(q5, q6);
q14u8 = vabdq_u8(q8, q7);
q3 = vabdq_u8(q9, q8);
q4 = vabdq_u8(q10, q9);
// Reduce the six differences to their per-lane maximum (finished below).
q11u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vmaxq_u8(q13u8, q14u8);
q3 = vmaxq_u8(q3, q4);
q15u8 = vmaxq_u8(q11u8, q12u8);
// q9 (q2 input) is reused for |p0-q0|.
q9 = vabdq_u8(q6, q7);
// vp8_hevmask
// High edge variance: |p1-p0| > thresh or |q1-q0| > thresh (OR-ed later).
q13u8 = vcgtq_u8(q13u8, qthresh);
q14u8 = vcgtq_u8(q14u8, qthresh);
q15u8 = vmaxq_u8(q15u8, q3);
q2u8 = vabdq_u8(q5, q8);
// Saturating 2 * |p0-q0|.
q9 = vqaddq_u8(q9, q9);
// Lanes where every neighbour difference is <= limit.
q15u8 = vcgeq_u8(qlimit, q15u8);
// vp8_filter() function
// convert to signed
// XOR with 0x80 maps the unsigned pixels to signed 8-bit values; q10 (q3
// input) now holds the 0x80 constant and is reused to convert back at the end.
q10 = vdupq_n_u8(0x80);
q8 = veorq_u8(q8, q10);
q7 = veorq_u8(q7, q10);
q6 = veorq_u8(q6, q10);
q5 = veorq_u8(q5, q10);
// q9 = sat(2 * |p0-q0| + |p1-q1| / 2), compared against blimit below.
q2u8 = vshrq_n_u8(q2u8, 1);
q9 = vqaddq_u8(q9, q2u8);
// Widening (qs0 - ps0) for the low and high 8 lanes, so the difference and
// the multiply by 3 cannot overflow.
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
vget_low_s8(vreinterpretq_s8_u8(q6)));
q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
vget_high_s8(vreinterpretq_s8_u8(q6)));
q9 = vcgeq_u8(qblimit, q9);
// Saturating (ps1 - qs1); kept only in high-edge-variance lanes below.
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
// q14u8 = hev mask.
q14u8 = vorrq_u8(q13u8, q14u8);
q4u16 = vdupq_n_u16(3);
q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
// q15u8 = final filter mask (limit test AND blimit test).
q15u8 = vandq_u8(q15u8, q9);
q1s8 = vreinterpretq_s8_u8(q1u8);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
// Rounding constants 3 and 4 (q4 and q9 reused as scratch).
q4 = vdupq_n_u8(3);
q9 = vdupq_n_u8(4);
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
d2s8 = vqmovn_s16(q2s16);
d3s8 = vqmovn_s16(q11s16);
q1s8 = vcombine_s8(d2s8, d3s8);
// Zero the filter value in lanes the mask rejects.
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
q1s8 = vreinterpretq_s8_u8(q1u8);
// q2s8 = sat(filter + 3) >> 3 is added to ps0;
// q1s8 = sat(filter + 4) >> 3 is subtracted from qs0.
q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
q2s8 = vshrq_n_s8(q2s8, 3);
q1s8 = vshrq_n_s8(q1s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
// Outer taps: rounding-halve the q0 adjustment and clear it where hev is set,
// then apply it to ps1 and qs1.
q1s8 = vrshrq_n_s8(q1s8, 1);
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
// Convert back to unsigned pixels and hand the results to the caller.
*q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
*q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
*q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
*q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
return;
}
// Filters a 16-pixel-wide horizontal edge with the 4-tap loop filter.
//
// s points at the first row below the edge (q0) and p is the row pitch.
// Eight bytes are read from each threshold pointer: the "0" set becomes the
// low half of each threshold vector (columns 0-7) and the "1" set the high
// half (columns 8-15). Only rows p1, p0, q0 and q1 are written back.
void vpx_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  const uint8x16_t blimit_vec =
      vcombine_u8(vld1_u8(blimit0), vld1_u8(blimit1));
  const uint8x16_t limit_vec = vcombine_u8(vld1_u8(limit0), vld1_u8(limit1));
  const uint8x16_t thresh_vec =
      vcombine_u8(vld1_u8(thresh0), vld1_u8(thresh1));
  // Four rows above the edge and four rows below it.
  const uint8x16_t p3 = vld1q_u8(s - 4 * p);
  const uint8x16_t p2 = vld1q_u8(s - 3 * p);
  const uint8x16_t p1 = vld1q_u8(s - 2 * p);
  const uint8x16_t p0 = vld1q_u8(s - 1 * p);
  const uint8x16_t q0 = vld1q_u8(s + 0 * p);
  const uint8x16_t q1 = vld1q_u8(s + 1 * p);
  const uint8x16_t q2 = vld1q_u8(s + 2 * p);
  const uint8x16_t q3 = vld1q_u8(s + 3 * p);
  uint8x16_t op1, op0, oq0, oq1;

  loop_filter_neon_16(blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0,
                      q1, q2, q3, &op1, &op0, &oq0, &oq1);

  vst1q_u8(s - 2 * p, op1);
  vst1q_u8(s - 1 * p, op0);
  vst1q_u8(s + 0 * p, oq0);
  vst1q_u8(s + 1 * p, oq1);
}

View File

@ -1,249 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
// Applies the 4-tap loop filter to 8 pixel positions in parallel.
//
// dblimit, dlimit and dthresh are the per-lane thresholds; d3u8..d18u8 are
// the eight pixel vectors p3..q3 across the edge. The filtered p1, p0, q0 and
// q1 are written through d4ru8, d5ru8, d6ru8 and d7ru8. p3, p2, q2 and q3 only
// feed the filter mask.
//
// (qs0 - ps0) is formed with a widening subtract so that a difference outside
// the signed 8-bit range is not wrapped before it is multiplied by 3 and
// clamped; this matches the 16-lane kernel loop_filter_neon_16().
static INLINE void loop_filter_neon(uint8x8_t dblimit,   // blimit
                                    uint8x8_t dlimit,    // limit
                                    uint8x8_t dthresh,   // thresh
                                    uint8x8_t d3u8,      // p3
                                    uint8x8_t d4u8,      // p2
                                    uint8x8_t d5u8,      // p1
                                    uint8x8_t d6u8,      // p0
                                    uint8x8_t d7u8,      // q0
                                    uint8x8_t d16u8,     // q1
                                    uint8x8_t d17u8,     // q2
                                    uint8x8_t d18u8,     // q3
                                    uint8x8_t *d4ru8,    // p1
                                    uint8x8_t *d5ru8,    // p0
                                    uint8x8_t *d6ru8,    // q0
                                    uint8x8_t *d7ru8) {  // q1
  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
  int16x8_t q12s16;
  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;

  // Neighbour differences |p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|,
  // |q3-q2|. d3u8/d4u8 (p3, p2) are reused as scratch afterwards.
  d19u8 = vabd_u8(d3u8, d4u8);
  d20u8 = vabd_u8(d4u8, d5u8);
  d21u8 = vabd_u8(d5u8, d6u8);
  d22u8 = vabd_u8(d16u8, d7u8);
  d3u8 = vabd_u8(d17u8, d16u8);
  d4u8 = vabd_u8(d18u8, d17u8);

  // Per-lane maximum of the six differences (completed into d23u8 below).
  d19u8 = vmax_u8(d19u8, d20u8);
  d20u8 = vmax_u8(d21u8, d22u8);
  d3u8 = vmax_u8(d3u8, d4u8);
  d23u8 = vmax_u8(d19u8, d20u8);

  // d17u8 (q2 input) is reused for |p0-q0|.
  d17u8 = vabd_u8(d6u8, d7u8);

  // High edge variance tests: |p1-p0| > thresh, |q1-q0| > thresh.
  d21u8 = vcgt_u8(d21u8, dthresh);
  d22u8 = vcgt_u8(d22u8, dthresh);
  d23u8 = vmax_u8(d23u8, d3u8);

  d28u8 = vabd_u8(d5u8, d16u8);
  d17u8 = vqadd_u8(d17u8, d17u8);  // sat(2 * |p0-q0|)

  // Lanes where every neighbour difference is <= limit.
  d23u8 = vcge_u8(dlimit, d23u8);

  // XOR with 0x80 converts the unsigned pixels to signed values; d18u8 keeps
  // the constant to convert back at the end.
  d18u8 = vdup_n_u8(0x80);
  d5u8 = veor_u8(d5u8, d18u8);
  d6u8 = veor_u8(d6u8, d18u8);
  d7u8 = veor_u8(d7u8, d18u8);
  d16u8 = veor_u8(d16u8, d18u8);

  // d17u8 = sat(2 * |p0-q0| + |p1-q1| / 2), then tested against blimit.
  d28u8 = vshr_n_u8(d28u8, 1);
  d17u8 = vqadd_u8(d17u8, d28u8);

  d19u8 = vdup_n_u8(3);

  // Widening (qs0 - ps0): the difference of two signed bytes needs 9 bits.
  q12s16 = vsubl_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));

  d17u8 = vcge_u8(dblimit, d17u8);

  // Saturating (ps1 - qs1), kept only in high-edge-variance lanes.
  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));

  d22u8 = vorr_u8(d21u8, d22u8);  // hev mask

  q12s16 = vmulq_s16(q12s16, vdupq_n_s16(3));

  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
  d23u8 = vand_u8(d23u8, d17u8);  // final filter mask

  // filter = clamp((ps1 - qs1 if hev) + 3 * (qs0 - ps0)) in masked lanes.
  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));

  d17u8 = vdup_n_u8(4);

  d27s8 = vqmovn_s16(q12s16);
  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
  d27s8 = vreinterpret_s8_u8(d27u8);

  // d28s8 = sat(filter + 3) >> 3 is added to ps0;
  // d27s8 = sat(filter + 4) >> 3 is subtracted from qs0.
  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
  d28s8 = vshr_n_s8(d28s8, 3);
  d27s8 = vshr_n_s8(d27s8, 3);

  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);

  // Outer taps: rounding-halve the q0 adjustment, clear it where hev is set,
  // then apply it to ps1 and qs1.
  d27s8 = vrshr_n_s8(d27s8, 1);
  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));

  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);

  // Back to unsigned pixels.
  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
  return;
}
// Filters an 8-pixel-wide horizontal edge with the 4-tap loop filter.
//
// src points at the first row below the edge (q0) and pitch is the row
// stride. Eight bytes are read from each of blimit, limit and thresh. Rows
// p3..q3 are read; only p1, p0, q0 and q1 are written back.
void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t blimit_vec = vld1_u8(blimit);
  const uint8x8_t limit_vec = vld1_u8(limit);
  const uint8x8_t thresh_vec = vld1_u8(thresh);
  // Four rows above the edge and four rows below it.
  const uint8x8_t p3 = vld1_u8(src - 4 * pitch);
  const uint8x8_t p2 = vld1_u8(src - 3 * pitch);
  const uint8x8_t p1 = vld1_u8(src - 2 * pitch);
  const uint8x8_t p0 = vld1_u8(src - 1 * pitch);
  const uint8x8_t q0 = vld1_u8(src + 0 * pitch);
  const uint8x8_t q1 = vld1_u8(src + 1 * pitch);
  const uint8x8_t q2 = vld1_u8(src + 2 * pitch);
  const uint8x8_t q3 = vld1_u8(src + 3 * pitch);
  uint8x8_t op1, op0, oq0, oq1;

  loop_filter_neon(blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                   q2, q3, &op1, &op0, &oq0, &oq1);

  vst1_u8(src - 2 * pitch, op1);
  vst1_u8(src - 1 * pitch, op0);
  vst1_u8(src + 0 * pitch, oq0);
  vst1_u8(src + 1 * pitch, oq1);
}
// Filters one 8-row vertical edge with the 4-tap loop filter.
//
// src points at the first pixel to the right of the edge in the top row and
// pitch is the row stride. Eight bytes are read from each of blimit, limit
// and thresh. Eight rows of eight pixels straddling the edge (src - 4 ..
// src + 3) are loaded and transposed so that each vector holds one pixel
// position (p3..q3) for all eight rows. After filtering, the four modified
// columns p1, p0, q0, q1 are written back to src - 2 .. src + 1 of every row.
//
// The function handles exactly one 8-row block; the former single-iteration
// loop (whose pointer bookkeeping would not have survived a second pass) has
// been removed.
void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
  uint8x8_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x8_t op1, op0, oq0, oq1;
  uint32x2x2_t w0, w1, w2, w3;
  uint16x4x2_t h0, h1, h2, h3;
  uint8x8x2_t b0, b1, b2, b3;
  uint8x8x4_t out;

  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);

  // Load eight rows, each starting four pixels left of the edge.
  s = src - 4;
  r0 = vld1_u8(s);
  s += pitch;
  r1 = vld1_u8(s);
  s += pitch;
  r2 = vld1_u8(s);
  s += pitch;
  r3 = vld1_u8(s);
  s += pitch;
  r4 = vld1_u8(s);
  s += pitch;
  r5 = vld1_u8(s);
  s += pitch;
  r6 = vld1_u8(s);
  s += pitch;
  r7 = vld1_u8(s);

  // 8x8 byte transpose in three stages: 32-bit, 16-bit, then 8-bit lanes.
  w0 = vtrn_u32(vreinterpret_u32_u8(r0), vreinterpret_u32_u8(r4));
  w1 = vtrn_u32(vreinterpret_u32_u8(r1), vreinterpret_u32_u8(r5));
  w2 = vtrn_u32(vreinterpret_u32_u8(r2), vreinterpret_u32_u8(r6));
  w3 = vtrn_u32(vreinterpret_u32_u8(r3), vreinterpret_u32_u8(r7));

  h0 = vtrn_u16(vreinterpret_u16_u32(w0.val[0]),
                vreinterpret_u16_u32(w2.val[0]));
  h1 = vtrn_u16(vreinterpret_u16_u32(w1.val[0]),
                vreinterpret_u16_u32(w3.val[0]));
  h2 = vtrn_u16(vreinterpret_u16_u32(w0.val[1]),
                vreinterpret_u16_u32(w2.val[1]));
  h3 = vtrn_u16(vreinterpret_u16_u32(w1.val[1]),
                vreinterpret_u16_u32(w3.val[1]));

  b0 = vtrn_u8(vreinterpret_u8_u16(h0.val[0]), vreinterpret_u8_u16(h1.val[0]));
  b1 = vtrn_u8(vreinterpret_u8_u16(h0.val[1]), vreinterpret_u8_u16(h1.val[1]));
  b2 = vtrn_u8(vreinterpret_u8_u16(h2.val[0]), vreinterpret_u8_u16(h3.val[0]));
  b3 = vtrn_u8(vreinterpret_u8_u16(h2.val[1]), vreinterpret_u8_u16(h3.val[1]));

  p3 = b0.val[0];
  p2 = b0.val[1];
  p1 = b1.val[0];
  p0 = b1.val[1];
  q0 = b2.val[0];
  q1 = b2.val[1];
  q2 = b3.val[0];
  q3 = b3.val[1];

  loop_filter_neon(dblimit, dlimit, dthresh, p3, p2, p1, p0, q0, q1, q2, q3,
                   &op1, &op0, &oq0, &oq1);

  // vst4_lane_u8 interleaves lane k of the four vectors, i.e. it writes the
  // four filtered pixels of row k back in one store.
  out.val[0] = op1;
  out.val[1] = op0;
  out.val[2] = oq0;
  out.val[3] = oq1;

  s = src - 2;
  vst4_lane_u8(s, out, 0);
  s += pitch;
  vst4_lane_u8(s, out, 1);
  s += pitch;
  vst4_lane_u8(s, out, 2);
  s += pitch;
  vst4_lane_u8(s, out, 3);
  s += pitch;
  vst4_lane_u8(s, out, 4);
  s += pitch;
  vst4_lane_u8(s, out, 5);
  s += pitch;
  vst4_lane_u8(s, out, 6);
  s += pitch;
  vst4_lane_u8(s, out, 7);
}

View File

@ -1,445 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
// Applies the 8-tap ("mb") loop filter to 8 pixel positions in parallel.
//
// dblimit, dlimit and dthresh are the per-lane thresholds; d3u8..d18u8 are
// the eight pixel vectors p3..q3 across the edge. Six outputs are produced:
// p2, p1, p0, q0, q1, q2 through d0ru8..d5ru8.
//
// Three paths are taken depending on the per-lane "flat" mask:
//   - every lane flat:  only the wide averaging filter is evaluated;
//   - no lane flat:     only the 4-tap filter is evaluated, p2/q2 pass through;
//   - mixed:            both are evaluated and blended per lane.
//
// (qs0 - ps0) is formed with a widening subtract so that a difference outside
// the signed 8-bit range is not wrapped before it is multiplied by 3 and
// clamped.
static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
                                      uint8x8_t dlimit,    // limit
                                      uint8x8_t dthresh,   // thresh
                                      uint8x8_t d3u8,      // p3
                                      uint8x8_t d4u8,      // p2
                                      uint8x8_t d5u8,      // p1
                                      uint8x8_t d6u8,      // p0
                                      uint8x8_t d7u8,      // q0
                                      uint8x8_t d16u8,     // q1
                                      uint8x8_t d17u8,     // q2
                                      uint8x8_t d18u8,     // q3
                                      uint8x8_t *d0ru8,    // p2
                                      uint8x8_t *d1ru8,    // p1
                                      uint8x8_t *d2ru8,    // p0
                                      uint8x8_t *d3ru8,    // q0
                                      uint8x8_t *d4ru8,    // q1
                                      uint8x8_t *d5ru8) {  // q2
  uint32_t flat;
  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int16x8_t q15s16;
  uint16x8_t q10u16, q14u16;
  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;

  // Neighbour differences |p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|,
  // |q3-q2| for the filter mask.
  d19u8 = vabd_u8(d3u8, d4u8);
  d20u8 = vabd_u8(d4u8, d5u8);
  d21u8 = vabd_u8(d5u8, d6u8);
  d22u8 = vabd_u8(d16u8, d7u8);
  d23u8 = vabd_u8(d17u8, d16u8);
  d24u8 = vabd_u8(d18u8, d17u8);

  d19u8 = vmax_u8(d19u8, d20u8);
  d20u8 = vmax_u8(d21u8, d22u8);  // max(|p1-p0|, |q1-q0|), reused for flat

  d25u8 = vabd_u8(d6u8, d4u8);  // |p0-p2|

  d23u8 = vmax_u8(d23u8, d24u8);

  d26u8 = vabd_u8(d7u8, d17u8);  // |q0-q2|

  d19u8 = vmax_u8(d19u8, d20u8);

  d24u8 = vabd_u8(d6u8, d7u8);   // |p0-q0|
  d27u8 = vabd_u8(d3u8, d6u8);   // |p3-p0|
  d28u8 = vabd_u8(d18u8, d7u8);  // |q3-q0|

  d19u8 = vmax_u8(d19u8, d23u8);  // max of the six neighbour differences

  d23u8 = vabd_u8(d5u8, d16u8);  // |p1-q1|
  d24u8 = vqadd_u8(d24u8, d24u8);

  d19u8 = vcge_u8(dlimit, d19u8);

  d25u8 = vmax_u8(d25u8, d26u8);
  d26u8 = vmax_u8(d27u8, d28u8);

  d23u8 = vshr_n_u8(d23u8, 1);

  d25u8 = vmax_u8(d25u8, d26u8);

  d24u8 = vqadd_u8(d24u8, d23u8);  // sat(2 * |p0-q0| + |p1-q1| / 2)

  // d20u8 = max distance of p1..p3 / q1..q3 from p0 / q0.
  d20u8 = vmax_u8(d20u8, d25u8);

  d23u8 = vdup_n_u8(1);
  d24u8 = vcge_u8(dblimit, d24u8);

  d21u8 = vcgt_u8(d21u8, dthresh);

  d20u8 = vcge_u8(d23u8, d20u8);  // flat: all those distances <= 1

  d19u8 = vand_u8(d19u8, d24u8);  // filter mask

  d23u8 = vcgt_u8(d22u8, dthresh);

  d20u8 = vand_u8(d20u8, d19u8);  // flat && mask

  d22u8 = vdup_n_u8(0x80);

  d23u8 = vorr_u8(d21u8, d23u8);  // hev mask

  // Narrow the flat mask to one nibble per lane: the low 32 bits of d30u8
  // come from d20u8 (d21u8 only fills the unused upper half), so flat is
  // 0xffffffff when every lane is flat and 0 when none is.
  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));

  d30u8 = vshrn_n_u16(q10u16, 4);
  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);

  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
    d27u8 = vdup_n_u8(3);
    d21u8 = vdup_n_u8(2);
    // Running sum of eight taps; each output is the rounded sum >> 3 and the
    // next sum is obtained by dropping two taps and adding two.
    // op2 = 3*p3 + 2*p2 + p1 + p0 + q0
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    *d0ru8 = vqrshrn_n_u16(q14u16, 3);

    // op1 = 2*p3 + p2 + 2*p1 + p0 + q0 + q1
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    *d1ru8 = vqrshrn_n_u16(q14u16, 3);

    // op0 = p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    *d2ru8 = vqrshrn_n_u16(q14u16, 3);

    // oq0 = p2 + p1 + p0 + 2*q0 + q1 + q2 + q3
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d3ru8 = vqrshrn_n_u16(q14u16, 3);

    // oq1 = p1 + p0 + q0 + 2*q1 + q2 + 2*q3
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d4ru8 = vqrshrn_n_u16(q14u16, 3);

    // oq2 = p0 + q0 + q1 + 2*q2 + 3*q3
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
  } else {
    // 4-tap filter on the signed (XOR 0x80) pixels.
    d21u8 = veor_u8(d7u8, d22u8);   // qs0
    d24u8 = veor_u8(d6u8, d22u8);   // ps0
    d25u8 = veor_u8(d5u8, d22u8);   // ps1
    d26u8 = veor_u8(d16u8, d22u8);  // qs1

    d27u8 = vdup_n_u8(3);

    // Widening (qs0 - ps0): the difference of two signed bytes needs 9 bits.
    q15s16 = vsubl_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));

    q15s16 = vmulq_s16(q15s16, vdupq_n_s16(3));

    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));  // keep where hev

    q15s16 = vaddw_s8(q15s16, d29s8);

    d29u8 = vdup_n_u8(4);

    d28s8 = vqmovn_s16(q15s16);

    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));  // apply filter mask

    // d30s8 = sat(filter + 3) >> 3 is added to ps0;
    // d29s8 = sat(filter + 4) >> 3 is subtracted from qs0.
    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
    d30s8 = vshr_n_s8(d30s8, 3);
    d29s8 = vshr_n_s8(d29s8, 3);

    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);

    // Outer taps: rounding-halve the q0 adjustment, clear it where hev is
    // set, then apply it to ps1 and qs1.
    d29s8 = vrshr_n_s8(d29s8, 1);
    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));

    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);

    if (flat == 0) {  // filter_branch_only
      // No lane is flat: p2 and q2 pass through unchanged.
      *d0ru8 = d4u8;
      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
      *d5ru8 = d17u8;
      return;
    }

    // Mixed case: 4-tap results back to unsigned, then blended per lane with
    // the wide filter outputs using the flat mask d20u8.
    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);

    d23u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);

    // In d0u8/d1u8/d2u8 the value picked for flat lanes (dblimit, dlimit,
    // dthresh) is a don't-care: the final vbsl below replaces flat lanes with
    // the wide filter output, so only the non-flat lanes survive.
    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);

    q14u16 = vaddw_u8(q14u16, d5u8);

    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);

    d30u8 = vqrshrn_n_u16(q14u16, 3);  // wide op2

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);

    d31u8 = vqrshrn_n_u16(q14u16, 3);  // wide op1

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);

    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);

    d23u8 = vqrshrn_n_u16(q14u16, 3);  // wide op0

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);

    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);

    d22u8 = vqrshrn_n_u16(q14u16, 3);  // wide oq0

    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    // From here d3u8..d7u8 are recycled as scratch; each is overwritten only
    // after its last use as a tap in the running sum.
    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);

    d6u8 = vqrshrn_n_u16(q14u16, 3);  // wide oq1

    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);

    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);

    d7u8 = vqrshrn_n_u16(q14u16, 3);  // wide oq2

    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
  }
  return;
}
// Filters an 8-pixel-wide horizontal edge with the 8-tap loop filter.
//
// src points at the first row below the edge (q0) and pitch is the row
// stride. Eight bytes are read from each of blimit, limit and thresh. Rows
// p3..q3 are read; rows p2..q2 (six rows) are written back.
void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t blimit_vec = vld1_u8(blimit);
  const uint8x8_t limit_vec = vld1_u8(limit);
  const uint8x8_t thresh_vec = vld1_u8(thresh);
  // Four rows above the edge and four rows below it.
  const uint8x8_t p3 = vld1_u8(src - 4 * pitch);
  const uint8x8_t p2 = vld1_u8(src - 3 * pitch);
  const uint8x8_t p1 = vld1_u8(src - 2 * pitch);
  const uint8x8_t p0 = vld1_u8(src - 1 * pitch);
  const uint8x8_t q0 = vld1_u8(src + 0 * pitch);
  const uint8x8_t q1 = vld1_u8(src + 1 * pitch);
  const uint8x8_t q2 = vld1_u8(src + 2 * pitch);
  const uint8x8_t q3 = vld1_u8(src + 3 * pitch);
  uint8x8_t op2, op1, op0, oq0, oq1, oq2;

  mbloop_filter_neon(blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                     q2, q3, &op2, &op1, &op0, &oq0, &oq1, &oq2);

  vst1_u8(src - 3 * pitch, op2);
  vst1_u8(src - 2 * pitch, op1);
  vst1_u8(src - 1 * pitch, op0);
  vst1_u8(src + 0 * pitch, oq0);
  vst1_u8(src + 1 * pitch, oq1);
  vst1_u8(src + 2 * pitch, oq2);
}
// Filters a 16-pixel-wide horizontal edge as two independent 8-pixel halves:
// columns 0-7 use the "0" thresholds, columns 8-15 the "1" thresholds.
void vpx_lpf_horizontal_8_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8_t *const second_half = s + 8;

  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_neon(second_half, p, blimit1, limit1, thresh1);
}
// Filters one 8-row vertical edge with the 8-tap loop filter.
//
// src points at the first pixel to the right of the edge in the top row and
// pitch is the row stride. Eight bytes are read from each of blimit, limit
// and thresh. Eight rows of eight pixels (src - 4 .. src + 3) are loaded and
// transposed so each vector holds one pixel position (p3..q3) for all rows.
// The six filtered columns p2..q2 are written back per row: four bytes at
// src - 3 and two bytes at src + 1.
void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8_t *row;
  uint8x8_t blimit_vec, limit_vec, thresh_vec;
  uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
  uint8x8_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x8_t op2, op1, op0, oq0, oq1, oq2;
  uint32x2x2_t w0, w1, w2, w3;
  uint16x4x2_t h0, h1, h2, h3;
  uint8x8x2_t b0, b1, b2, b3;
  uint8x8x4_t left4;
  uint8x8x2_t right2;

  blimit_vec = vld1_u8(blimit);
  limit_vec = vld1_u8(limit);
  thresh_vec = vld1_u8(thresh);

  // Load eight rows, each starting four pixels left of the edge.
  row = src - 4;
  r0 = vld1_u8(row);
  row += pitch;
  r1 = vld1_u8(row);
  row += pitch;
  r2 = vld1_u8(row);
  row += pitch;
  r3 = vld1_u8(row);
  row += pitch;
  r4 = vld1_u8(row);
  row += pitch;
  r5 = vld1_u8(row);
  row += pitch;
  r6 = vld1_u8(row);
  row += pitch;
  r7 = vld1_u8(row);

  // 8x8 byte transpose in three stages: 32-bit, 16-bit, then 8-bit lanes.
  w0 = vtrn_u32(vreinterpret_u32_u8(r0), vreinterpret_u32_u8(r4));
  w1 = vtrn_u32(vreinterpret_u32_u8(r1), vreinterpret_u32_u8(r5));
  w2 = vtrn_u32(vreinterpret_u32_u8(r2), vreinterpret_u32_u8(r6));
  w3 = vtrn_u32(vreinterpret_u32_u8(r3), vreinterpret_u32_u8(r7));

  h0 = vtrn_u16(vreinterpret_u16_u32(w0.val[0]),
                vreinterpret_u16_u32(w2.val[0]));
  h1 = vtrn_u16(vreinterpret_u16_u32(w1.val[0]),
                vreinterpret_u16_u32(w3.val[0]));
  h2 = vtrn_u16(vreinterpret_u16_u32(w0.val[1]),
                vreinterpret_u16_u32(w2.val[1]));
  h3 = vtrn_u16(vreinterpret_u16_u32(w1.val[1]),
                vreinterpret_u16_u32(w3.val[1]));

  b0 = vtrn_u8(vreinterpret_u8_u16(h0.val[0]), vreinterpret_u8_u16(h1.val[0]));
  b1 = vtrn_u8(vreinterpret_u8_u16(h0.val[1]), vreinterpret_u8_u16(h1.val[1]));
  b2 = vtrn_u8(vreinterpret_u8_u16(h2.val[0]), vreinterpret_u8_u16(h3.val[0]));
  b3 = vtrn_u8(vreinterpret_u8_u16(h2.val[1]), vreinterpret_u8_u16(h3.val[1]));

  p3 = b0.val[0];
  p2 = b0.val[1];
  p1 = b1.val[0];
  p0 = b1.val[1];
  q0 = b2.val[0];
  q1 = b2.val[1];
  q2 = b3.val[0];
  q3 = b3.val[1];

  mbloop_filter_neon(blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                     q2, q3, &op2, &op1, &op0, &oq0, &oq1, &oq2);

  // Lane k of the interleaving stores writes row k: p2 p1 p0 q0, then q1 q2.
  left4.val[0] = op2;
  left4.val[1] = op1;
  left4.val[2] = op0;
  left4.val[3] = oq0;
  right2.val[0] = oq1;
  right2.val[1] = oq2;

  row = src - 3;
  vst4_lane_u8(row, left4, 0);
  row += pitch;
  vst4_lane_u8(row, left4, 1);
  row += pitch;
  vst4_lane_u8(row, left4, 2);
  row += pitch;
  vst4_lane_u8(row, left4, 3);
  row += pitch;
  vst4_lane_u8(row, left4, 4);
  row += pitch;
  vst4_lane_u8(row, left4, 5);
  row += pitch;
  vst4_lane_u8(row, left4, 6);
  row += pitch;
  vst4_lane_u8(row, left4, 7);

  row = src + 1;
  vst2_lane_u8(row, right2, 0);
  row += pitch;
  vst2_lane_u8(row, right2, 1);
  row += pitch;
  vst2_lane_u8(row, right2, 2);
  row += pitch;
  vst2_lane_u8(row, right2, 3);
  row += pitch;
  vst2_lane_u8(row, right2, 4);
  row += pitch;
  vst2_lane_u8(row, right2, 5);
  row += pitch;
  vst2_lane_u8(row, right2, 6);
  row += pitch;
  vst2_lane_u8(row, right2, 7);
}
// Filters a 16-row vertical edge as two independent 8-row blocks: rows 0-7
// use the "0" thresholds, rows 8-15 the "1" thresholds.
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const lower_block = s + 8 * p;

  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_neon(lower_block, p, blimit1, limit1, thresh1);
}

View File

@ -31,6 +31,15 @@ FUN_LOAD_THRESH(8, _) // load_thresh_8
FUN_LOAD_THRESH(16, q_) // load_thresh_16
#undef FUN_LOAD_THRESH
// Builds 16-lane threshold vectors for a dual (two 8-pixel halves) filter.
// One byte is read from each pointer and broadcast: the "0" value fills the
// low eight lanes of the corresponding output, the "1" value the high eight.
static INLINE void load_thresh_8_dual(
    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
    uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
  const uint8x8_t blimit_lo = vld1_dup_u8(blimit0);
  const uint8x8_t blimit_hi = vld1_dup_u8(blimit1);
  const uint8x8_t limit_lo = vld1_dup_u8(limit0);
  const uint8x8_t limit_hi = vld1_dup_u8(limit1);
  const uint8x8_t thresh_lo = vld1_dup_u8(thresh0);
  const uint8x8_t thresh_hi = vld1_dup_u8(thresh1);

  *blimit_vec = vcombine_u8(blimit_lo, blimit_hi);
  *limit_vec = vcombine_u8(limit_lo, limit_hi);
  *thresh_vec = vcombine_u8(thresh_lo, thresh_hi);
}
// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a
// pixel. When used to control filter branches, we only detect whether it is all
// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
@ -56,6 +65,38 @@ static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
return calc_flat_status_8(flat_4bit);
}
// Generates filter_hev_mask4_<w>(), which derives the 4-tap filter controls:
//   *hev  - lanes where max(|p1-p0|, |q1-q0|) > thresh;
//   *mask - lanes where all six neighbour differences are <= limit and
//           sat(2 * |p0-q0| + |p1-q1| / 2) <= blimit.
// The return value is max(|p1-p0|, |q1-q0|) so callers can extend it.
#define FUN_FILTER_HEV_MASK4(w, r)                                            \
  static INLINE uint8x##w##_t filter_hev_mask4_##w(                           \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) {      \
    const uint8x##w##_t inner =                                               \
        vmax##r##u8(vabd##r##u8(p1, p0), vabd##r##u8(q1, q0));                \
    const uint8x##w##_t abs_p0q0 = vabd##r##u8(p0, q0);                       \
    const uint8x##w##_t half_p1q1 = vshr##r##n_u8(vabd##r##u8(p1, q1), 1);    \
    const uint8x##w##_t edge =                                                \
        vqadd##r##u8(vqadd##r##u8(abs_p0q0, abs_p0q0), half_p1q1);            \
    uint8x##w##_t spread;                                                     \
                                                                              \
    spread = vmax##r##u8(inner, vabd##r##u8(p3, p2));                         \
    spread = vmax##r##u8(spread, vabd##r##u8(p2, p1));                        \
    spread = vmax##r##u8(spread, vabd##r##u8(q2, q1));                        \
    spread = vmax##r##u8(spread, vabd##r##u8(q3, q2));                        \
                                                                              \
    *hev = vcgt##r##u8(inner, thresh);                                        \
    *mask = vand##r##u8(vcle##r##u8(spread, limit),                           \
                        vcle##r##u8(edge, blimit));                           \
                                                                              \
    return inner;                                                             \
  }
FUN_FILTER_HEV_MASK4(8, _)    // filter_hev_mask4_8
FUN_FILTER_HEV_MASK4(16, q_)  // filter_hev_mask4_16
#undef FUN_FILTER_HEV_MASK4
#define FUN_FILTER_FLAT_HEV_MASK(w, r) \
static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \
const uint8x##w##_t limit, const uint8x##w##_t blimit, \
@ -64,25 +105,11 @@ static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \
uint8x##w##_t *hev) { \
uint8x##w##_t t0, t1, mask; \
\
mask = vabd##r##u8(p1, p0); \
mask = vmax##r##u8(mask, vabd##r##u8(q1, q0)); \
*hev = vcgt##r##u8(mask, thresh); \
*flat = vmax##r##u8(mask, vabd##r##u8(p2, p0)); \
mask = vmax##r##u8(mask, vabd##r##u8(p3, p2)); \
mask = vmax##r##u8(mask, vabd##r##u8(p2, p1)); \
mask = vmax##r##u8(mask, vabd##r##u8(q2, q1)); \
mask = vmax##r##u8(mask, vabd##r##u8(q3, q2)); \
t0 = vabd##r##u8(p0, q0); \
t1 = vabd##r##u8(p1, q1); \
t0 = vqadd##r##u8(t0, t0); \
t1 = vshr##r##n_u8(t1, 1); \
t0 = vqadd##r##u8(t0, t1); \
mask = vcle##r##u8(mask, limit); \
t0 = vcle##r##u8(t0, blimit); \
mask = vand##r##u8(mask, t0); \
uint8x##w##_t max, mask; \
\
max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
q2, q3, hev, &mask); \
*flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \
*flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \
*flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \
*flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \
@ -420,6 +447,33 @@ FUN_FILTER4(8, _) // filter4_8
FUN_FILTER4(16, q_) // filter4_16
#undef FUN_FILTER4
// Generates filter8_<w>(), which selects between the 4-tap and 7-tap paths
// based on flat_status:
//   - flat_status == (uint32_t)-2: only calc_7_tap_filter_<w>() runs
//     (presumably the "every lane flat" value produced by the flat-status
//     helpers -- confirm against calc_flat_status_*);
//   - otherwise filter4_<w>() runs and p2/q2 pass through, and when
//     flat_status is non-zero apply_7_tap_filter_<w>() is then applied with
//     the per-lane flat mask.
#define FUN_FILTER8(w)                                                        \
  static INLINE void filter8_##w(                                             \
      const uint8x##w##_t mask, const uint8x##w##_t flat,                     \
      const uint32_t flat_status, const uint8x##w##_t hev,                    \
      const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \
      const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \
      const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2,     \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,             \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2) {                               \
    if (flat_status == (uint32_t)-2) {                                        \
      calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,    \
                            oq0, oq1, oq2);                                   \
      return;                                                                 \
    }                                                                         \
                                                                              \
    filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);               \
    *op2 = p2;                                                                \
    *oq2 = q2;                                                                \
    if (flat_status != 0) {                                                   \
      apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1,  \
                             op0, oq0, oq1, oq2);                             \
    }                                                                         \
  }
FUN_FILTER8(8)   // filter8_8
FUN_FILTER8(16)  // filter8_16
#undef FUN_FILTER8
#define FUN_FILTER16(w) \
static INLINE void filter16_##w( \
const uint8x##w##_t mask, const uint8x##w##_t flat, \
@ -481,6 +535,7 @@ FUN_FILTER16(16) // filter16_16
*q3 = vld1##r##u8(s); \
}
FUN_LOAD8(8, _) // load_8x8
FUN_LOAD8(16, q_) // load_16x8
#undef FUN_LOAD8
@ -529,6 +584,71 @@ FUN_LOAD16(8, _) // load_8x16
FUN_LOAD16(16, q_) // load_16x16
#undef FUN_LOAD16
// Generates store_<w>x4(): writes four <w>-byte vectors to four consecutive
// rows starting at s, with p bytes between rows.
#define FUN_STORE4(w, r)                                                       \
  static INLINE void store_##w##x4(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3) {                        \
    vst1##r##u8(s + 0 * p, s0);                                                \
    vst1##r##u8(s + 1 * p, s1);                                                \
    vst1##r##u8(s + 2 * p, s2);                                                \
    vst1##r##u8(s + 3 * p, s3);                                                \
  }
FUN_STORE4(8, _)    // store_8x4
FUN_STORE4(16, q_)  // store_16x4
#undef FUN_STORE4
// Generates store_<w>x6(): writes six <w>-byte vectors to six consecutive
// rows starting at s, with p bytes between rows.
#define FUN_STORE6(w, r)                                                       \
  static INLINE void store_##w##x6(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
      const uint8x##w##_t s5) {                                                \
    vst1##r##u8(s + 0 * p, s0);                                                \
    vst1##r##u8(s + 1 * p, s1);                                                \
    vst1##r##u8(s + 2 * p, s2);                                                \
    vst1##r##u8(s + 3 * p, s3);                                                \
    vst1##r##u8(s + 4 * p, s4);                                                \
    vst1##r##u8(s + 5 * p, s5);                                                \
  }
FUN_STORE6(8, _)    // store_8x6
FUN_STORE6(16, q_)  // store_16x6
#undef FUN_STORE6
// Writes a 4-wide, 8-tall block: row k receives lane k of p1, p0, q0 and q1
// as four adjacent bytes. Rows are p bytes apart, starting at s.
static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
                             const uint8x8_t p0, const uint8x8_t q0,
                             const uint8x8_t q1) {
  uint8x8x4_t cols;

  cols.val[0] = p1;
  cols.val[1] = p0;
  cols.val[2] = q0;
  cols.val[3] = q1;

  // The lane index must be a compile-time constant, hence no loop.
  vst4_lane_u8(s + 0 * p, cols, 0);
  vst4_lane_u8(s + 1 * p, cols, 1);
  vst4_lane_u8(s + 2 * p, cols, 2);
  vst4_lane_u8(s + 3 * p, cols, 3);
  vst4_lane_u8(s + 4 * p, cols, 4);
  vst4_lane_u8(s + 5 * p, cols, 5);
  vst4_lane_u8(s + 6 * p, cols, 6);
  vst4_lane_u8(s + 7 * p, cols, 7);
}
static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
const uint8x8_t s1, const uint8x8_t s2,
const uint8x8_t s3, const uint8x8_t s4,
@ -566,53 +686,64 @@ static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
vst3_lane_u8(s + 0, o1, 7);
}
static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
const uint8x8_t p0, const uint8x8_t q0,
const uint8x8_t q1) {
uint8x8x4_t o;
#define FUN_STORE8(w, r) \
static INLINE void store_##w##x8( \
uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
const uint8x##w##_t s5, const uint8x##w##_t s6, \
const uint8x##w##_t s7) { \
vst1##r##u8(s, s0); \
s += p; \
vst1##r##u8(s, s1); \
s += p; \
vst1##r##u8(s, s2); \
s += p; \
vst1##r##u8(s, s3); \
s += p; \
vst1##r##u8(s, s4); \
s += p; \
vst1##r##u8(s, s5); \
s += p; \
vst1##r##u8(s, s6); \
s += p; \
vst1##r##u8(s, s7); \
}
o.val[0] = p1;
o.val[1] = p0;
o.val[2] = q0;
o.val[3] = q1;
vst4_lane_u8(s, o, 0);
s += p;
vst4_lane_u8(s, o, 1);
s += p;
vst4_lane_u8(s, o, 2);
s += p;
vst4_lane_u8(s, o, 3);
s += p;
vst4_lane_u8(s, o, 4);
s += p;
vst4_lane_u8(s, o, 5);
s += p;
vst4_lane_u8(s, o, 6);
s += p;
vst4_lane_u8(s, o, 7);
}
FUN_STORE8(8, _) // store_8x8
FUN_STORE8(16, q_) // store_16x8
#undef FUN_STORE8
// Writes eight 16-byte vectors to eight consecutive rows starting at s, with
// p bytes between rows.
static INLINE void store_16x8(uint8_t *s, const int p, const uint8x16_t s0,
                              const uint8x16_t s1, const uint8x16_t s2,
                              const uint8x16_t s3, const uint8x16_t s4,
                              const uint8x16_t s5, const uint8x16_t s6,
                              const uint8x16_t s7) {
  vst1q_u8(s + 0 * p, s0);
  vst1q_u8(s + 1 * p, s1);
  vst1q_u8(s + 2 * p, s2);
  vst1q_u8(s + 3 * p, s3);
  vst1q_u8(s + 4 * p, s4);
  vst1q_u8(s + 5 * p, s5);
  vst1q_u8(s + 6 * p, s6);
  vst1q_u8(s + 7 * p, s7);
}
// Generates store_<w>x14(): writes back only the rows a filter pass can have
// changed. s points at row q0 and p is the row pitch.
//   - p1, p0, q0, q1 are always stored;
//   - p2 and q2 are stored when flat_status is non-zero;
//   - p6..p3 and q3..q6 are stored only when both flat_status and
//     flat2_status are non-zero.
#define FUN_STORE14(w, r)                                                      \
  static INLINE void store_##w##x14(                                           \
      uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint32_t flat_status, const uint32_t flat2_status) {               \
    const int store_outer = flat_status && flat2_status;                       \
                                                                               \
    if (store_outer) {                                                         \
      vst1##r##u8(s - 7 * p, p6);                                              \
      vst1##r##u8(s - 6 * p, p5);                                              \
      vst1##r##u8(s - 5 * p, p4);                                              \
      vst1##r##u8(s - 4 * p, p3);                                              \
      vst1##r##u8(s + 3 * p, q3);                                              \
      vst1##r##u8(s + 4 * p, q4);                                              \
      vst1##r##u8(s + 5 * p, q5);                                              \
      vst1##r##u8(s + 6 * p, q6);                                              \
    }                                                                          \
    if (flat_status) {                                                         \
      vst1##r##u8(s - 3 * p, p2);                                              \
      vst1##r##u8(s + 2 * p, q2);                                              \
    }                                                                          \
    vst1##r##u8(s - 2 * p, p1);                                                \
    vst1##r##u8(s - 1 * p, p0);                                                \
    vst1##r##u8(s + 0 * p, q0);                                                \
    vst1##r##u8(s + 1 * p, q1);                                                \
  }
FUN_STORE14(8, _)    // store_8x14
FUN_STORE14(16, q_)  // store_16x14
#undef FUN_STORE14
static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
const uint8x16_t s1, const uint8x16_t s2,
@ -656,37 +787,160 @@ static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
vst1q_u8(s, s15);
}
#define FUN_STORE14(w, r) \
static INLINE void store_##w##x14( \
uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
const uint32_t flat_status, const uint32_t flat2_status) { \
if (flat_status) { \
if (flat2_status) { \
vst1##r##u8(s - 7 * p, p6); \
vst1##r##u8(s - 6 * p, p5); \
vst1##r##u8(s - 5 * p, p4); \
vst1##r##u8(s - 4 * p, p3); \
vst1##r##u8(s + 3 * p, q3); \
vst1##r##u8(s + 4 * p, q4); \
vst1##r##u8(s + 5 * p, q5); \
vst1##r##u8(s + 6 * p, q6); \
} \
vst1##r##u8(s - 3 * p, p2); \
vst1##r##u8(s + 2 * p, q2); \
} \
vst1##r##u8(s - 2 * p, p1); \
vst1##r##u8(s - 1 * p, p0); \
vst1##r##u8(s + 0 * p, q0); \
vst1##r##u8(s + 1 * p, q1); \
#define FUN_HOR_4_KERNEL(name, w) \
static INLINE void lpf_horizontal_4##name##kernel( \
uint8_t *s, const int p, const uint8x##w##_t blimit, \
const uint8x##w##_t limit, const uint8x##w##_t thresh) { \
uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \
\
load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \
filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
q3, &hev, &mask); \
filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \
store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \
}
FUN_STORE14(8, _) // store_8x14
FUN_STORE14(16, q_) // store_16x14
#undef FUN_STORE14
FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel
FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel
#undef FUN_HOR_4_KERNEL
// Size-4 loop filter across a horizontal edge, single (8-lane) version.
// The three threshold pointers are converted to uint8x8_t vectors by
// load_thresh_8() and the actual filtering is delegated to
// lpf_horizontal_4_kernel(). s points at the first line below the edge and p
// is the line pitch.
void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t vblimit;
  uint8x8_t vlimit;
  uint8x8_t vthresh;

  load_thresh_8(blimit, limit, thresh, &vblimit, &vlimit, &vthresh);
  lpf_horizontal_4_kernel(s, p, vblimit, vlimit, vthresh);
}
// Size-4 loop filter across a horizontal edge, dual (16-lane) version.
// Two independent threshold sets (blimit0/limit0/thresh0 and
// blimit1/limit1/thresh1) are merged into uint8x16_t vectors by
// load_thresh_8_dual(); lpf_horizontal_4_dual_kernel() then filters all 16
// lanes in one pass.
void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t vblimit;
  uint8x16_t vlimit;
  uint8x16_t vthresh;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &vblimit, &vlimit, &vthresh);
  lpf_horizontal_4_dual_kernel(s, p, vblimit, vlimit, vthresh);
}
// Size-4 loop filter across a vertical edge, single (8-line) version.
// s points at the first pixel to the right of the edge and p is the line
// pitch. The block that starts 4 bytes to the left of s is loaded and
// transposed so the horizontal-style filter helpers can be reused; only the
// four filtered columns (p1, p0, q0, q1) are written back, starting at s - 2.
void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t vblimit, vlimit, vthresh;
  uint8x8_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x8_t mask, hev;

  load_thresh_8(blimit, limit, thresh, &vblimit, &vlimit, &vthresh);

  // Load by line, then transpose so that each vector holds one column.
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

  filter_hev_mask4_8(vlimit, vblimit, vthresh, p3, p2, p1, p0, q0, q1, q2, q3,
                     &hev, &mask);
  // p1, p0, q0 and q1 are overwritten with their filtered values.
  filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);

  store_4x8(s - 2, p, p1, p0, q0, q1);
}
// Size-4 loop filter across a vertical edge, dual (16-line) version.
// Sixteen 8-byte lines starting 4 bytes to the left of s are loaded and
// transposed into eight 16-lane column vectors. After filtering, p1, p0, q0
// and q1 are written back starting at s - 2 in two halves: the low lanes to
// lines 0..7 and the high lanes to lines 8..15.
void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const dst = s - 2;
  uint8x16_t vblimit, vlimit, vthresh;
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x16_t mask, hev;
  uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
  uint8x8_t r8, r9, r10, r11, r12, r13, r14, r15;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &vblimit, &vlimit, &vthresh);

  // r0..r15 are the sixteen input lines; the transpose yields the columns.
  load_8x16(s - 4, p, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &r8, &r9, &r10,
            &r11, &r12, &r13, &r14, &r15);
  transpose_u8_8x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
                    r14, r15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

  filter_hev_mask4_16(vlimit, vblimit, vthresh, p3, p2, p1, p0, q0, q1, q2, q3,
                      &hev, &mask);
  // p1, p0, q0 and q1 are overwritten with their filtered values.
  filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);

  store_4x8(dst, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
            vget_low_u8(q1));
  store_4x8(dst + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0),
            vget_high_u8(q0), vget_high_u8(q1));
}
// Size-8 loop filter across a horizontal edge, single (8-lane) version.
// Eight lines starting at s - 4 * p are loaded; filter8_8() produces six
// output lines (op2..oq2), which are stored starting at s - 3 * p.
void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t vblimit, vlimit, vthresh;
  uint8x8_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x8_t op2, op1, op0, oq0, oq1, oq2;
  uint8x8_t mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8(blimit, limit, thresh, &vblimit, &vlimit, &vthresh);
  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

  // The mask is the return value; flat, flat_status and hev come back through
  // the trailing pointer arguments.
  mask = filter_flat_hev_mask_8(vlimit, vblimit, vthresh, p3, p2, p1, p0, q0,
                                q1, q2, q3, &flat, &flat_status, &hev);
  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
            &op1, &op0, &oq0, &oq1, &oq2);

  store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}
// Size-8 loop filter across a horizontal edge, dual (16-lane) version.
// Same flow as the single version, but two threshold sets are merged into
// 16-lane vectors and all helpers are the _16 variants. Six output lines are
// stored starting at s - 3 * p.
void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t vblimit, vlimit, vthresh;
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x16_t op2, op1, op0, oq0, oq1, oq2;
  uint8x16_t mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &vblimit, &vlimit, &vthresh);
  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

  // The mask is the return value; flat, flat_status and hev come back through
  // the trailing pointer arguments.
  mask = filter_flat_hev_mask_16(vlimit, vblimit, vthresh, p3, p2, p1, p0, q0,
                                 q1, q2, q3, &flat, &flat_status, &hev);
  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3,
             &op2, &op1, &op0, &oq0, &oq1, &oq2);

  store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}
// Size-8 loop filter across a vertical edge, single (8-line) version.
// The 8x8 block that starts 4 bytes to the left of s is loaded, transposed,
// filtered, transposed back and stored to the same location. filter8_8() only
// produces op2..oq2, so the outermost columns p3 and q3 go back unchanged.
void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t vblimit, vlimit, vthresh;
  uint8x8_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x8_t op2, op1, op0, oq0, oq1, oq2;
  uint8x8_t mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8(blimit, limit, thresh, &vblimit, &vlimit, &vthresh);

  // Load by line, then transpose so that each vector holds one column.
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

  mask = filter_flat_hev_mask_8(vlimit, vblimit, vthresh, p3, p2, p1, p0, q0,
                                q1, q2, q3, &flat, &flat_status, &hev);
  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
            &op1, &op0, &oq0, &oq1, &oq2);

  // Note: transpose + store_8x8() is faster than store_6x8().
  transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
  store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}
// Size-8 loop filter across a vertical edge, dual (16-line) version.
// Sixteen 8-byte lines starting 4 bytes to the left of s are loaded and
// transposed into eight 16-lane column vectors. The six filtered columns
// (op2..oq2) are written back with two store_6x8() calls: low lanes for lines
// 0..7 and high lanes for lines 8..15.
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8x16_t vblimit, vlimit, vthresh;
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x16_t op2, op1, op0, oq0, oq1, oq2;
  uint8x16_t mask, flat, hev;
  uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
  uint8x8_t r8, r9, r10, r11, r12, r13, r14, r15;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &vblimit, &vlimit, &vthresh);

  // r0..r15 are the sixteen input lines; the transpose yields the columns.
  load_8x16(s - 4, p, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &r8, &r9, &r10,
            &r11, &r12, &r13, &r14, &r15);
  transpose_u8_8x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
                    r14, r15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

  mask = filter_flat_hev_mask_16(vlimit, vblimit, vthresh, p3, p2, p1, p0, q0,
                                 q1, q2, q3, &flat, &flat_status, &hev);
  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3,
             &op2, &op1, &op0, &oq0, &oq1, &oq2);

  // Note: store_6x8() twice is faster than transpose + store_8x16().
  store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
            vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
  store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
            vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
            vget_high_u8(oq2));
}
#define FUN_LPF_16_KERNEL(name, w) \
static INLINE void lpf_16##name##kernel( \
@ -784,7 +1038,9 @@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
&s6, &s7);
store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
} else {
store_6x8(s + 8, p, op2, op1, op0, oq0, oq1, oq2);
// Note: transpose + store_8x8() is faster than store_6x8().
transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}
} else {
store_4x8(s + 6, p, op1, op0, oq0, oq1);
@ -819,6 +1075,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
s13, s14, s15);
} else {
// Note: store_6x8() twice is faster than transpose + store_8x16().
s += 8;
store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));

View File

@ -39,28 +39,84 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
return b0;
}
// Transposes an 8x8 block of bytes in place: on entry *a0..*a7 hold lines
// 0..7 of the block, on return *ak holds what used to be column k.
// Note: the 'd' register and 'q' register formulations run at almost the same
// speed; the 'q' form is used here because it needs fewer instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // In the diagrams below "rc" is the byte from input line r, column c.
  //
  // Step 0: pair line k with line k + 4 in one 16-byte vector.
  // line04: 00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47
  // line15: 10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57
  // line26: 20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67
  // line37: 30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77
  const uint8x16_t line04 = vcombine_u8(*a0, *a4);
  const uint8x16_t line15 = vcombine_u8(*a1, *a5);
  const uint8x16_t line26 = vcombine_u8(*a2, *a6);
  const uint8x16_t line37 = vcombine_u8(*a3, *a7);

  // Step 1: transpose at byte granularity.
  // t8_01.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // t8_01.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // t8_23.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // t8_23.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
  const uint8x16x2_t t8_01 = vtrnq_u8(line04, line15);
  const uint8x16x2_t t8_23 = vtrnq_u8(line26, line37);

  // Step 2: transpose at 16-bit granularity.
  // t16_even.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // t16_even.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // t16_odd.val[0]:  01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // t16_odd.val[1]:  03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
  const uint16x8x2_t t16_even = vtrnq_u16(vreinterpretq_u16_u8(t8_01.val[0]),
                                          vreinterpretq_u16_u8(t8_23.val[0]));
  const uint16x8x2_t t16_odd = vtrnq_u16(vreinterpretq_u16_u8(t8_01.val[1]),
                                         vreinterpretq_u16_u8(t8_23.val[1]));

  // Step 3: de-interleave 32-bit groups so every 8-byte half is one column.
  // cols_0145.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // cols_0145.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // cols_2367.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // cols_2367.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t cols_0145 =
      vuzpq_u32(vreinterpretq_u32_u16(t16_even.val[0]),
                vreinterpretq_u32_u16(t16_odd.val[0]));
  const uint32x4x2_t cols_2367 =
      vuzpq_u32(vreinterpretq_u32_u16(t16_even.val[1]),
                vreinterpretq_u32_u16(t16_odd.val[1]));

  // Split each 16-byte vector into its two columns and hand them back.
  *a0 = vreinterpret_u8_u32(vget_low_u32(cols_0145.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(cols_0145.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(cols_2367.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(cols_2367.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(cols_0145.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(cols_0145.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(cols_2367.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(cols_2367.val[1]));
}
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 08 09 10 11 12 13 14 15
// a2: 16 17 18 19 20 21 22 23
// a3: 24 25 26 27 28 29 30 31
// a4: 32 33 34 35 36 37 38 39
// a5: 40 41 42 43 44 45 46 47
// a6: 48 49 50 51 52 53 54 55
// a7: 56 57 58 59 60 61 62 63
// a1: 10 11 12 13 14 15 16 17
// a2: 20 21 22 23 24 25 26 27
// a3: 30 31 32 33 34 35 36 37
// a4: 40 41 42 43 44 45 46 47
// a5: 50 51 52 53 54 55 56 57
// a6: 60 61 62 63 64 65 66 67
// a7: 70 71 72 73 74 75 76 77
// to:
// b0.val[0]: 00 08 02 10 04 12 06 14
// b0.val[1]: 01 09 03 11 05 13 07 15
// b1.val[0]: 16 24 18 26 20 28 22 30
// b1.val[1]: 17 25 19 27 21 29 23 31
// b2.val[0]: 32 40 34 42 36 44 38 46
// b2.val[1]: 33 41 35 43 37 45 39 47
// b3.val[0]: 48 56 50 58 52 60 54 62
// b3.val[1]: 49 57 51 59 53 61 55 63
// b0.val[0]: 00 10 02 12 04 14 06 16
// b0.val[1]: 01 11 03 13 05 15 07 17
// b1.val[0]: 20 30 22 32 24 34 26 36
// b1.val[1]: 21 31 23 33 25 35 27 37
// b2.val[0]: 40 50 42 52 44 54 46 56
// b2.val[1]: 41 51 43 53 45 55 47 57
// b3.val[0]: 60 70 62 72 64 74 66 76
// b3.val[1]: 61 71 63 73 65 75 67 77
const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
@ -68,14 +124,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
// Swap 32 bit elements resulting in:
// c0.val[0]: 00 08 16 24 04 12 20 28
// c0.val[1]: 02 10 18 26 06 14 22 30
// c1.val[0]: 01 09 17 25 05 13 21 29
// c1.val[1]: 03 11 19 27 07 15 23 31
// c2.val[0]: 32 40 48 56 36 44 52 60
// c2.val[1]: 34 42 50 58 38 46 54 62
// c3.val[0]: 33 41 49 57 37 45 53 61
// c3.val[1]: 35 43 51 59 39 47 55 63
// c0.val[0]: 00 10 20 30 04 14 24 34
// c0.val[1]: 02 12 22 32 06 16 26 36
// c1.val[0]: 01 11 21 31 05 15 25 35
// c1.val[1]: 03 13 23 33 07 17 27 37
// c2.val[0]: 40 50 60 70 44 54 64 74
// c2.val[1]: 42 52 62 72 46 56 66 76
// c3.val[0]: 41 51 61 71 45 55 65 75
// c3.val[1]: 43 53 63 73 47 57 67 77
const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
vreinterpretq_s32_s16(b1.val[0]));
@ -87,14 +143,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
vreinterpretq_s32_s16(b3.val[1]));
// Swap 64 bit elements resulting in:
// d0.val[0]: 00 08 16 24 32 40 48 56
// d0.val[1]: 04 12 20 28 36 44 52 60
// d1.val[0]: 01 09 17 25 33 41 49 57
// d1.val[1]: 05 13 21 29 37 45 53 61
// d2.val[0]: 02 10 18 26 34 42 50 58
// d2.val[1]: 06 14 22 30 38 46 54 62
// d3.val[0]: 03 11 19 27 35 43 51 59
// d3.val[1]: 07 15 23 31 39 47 55 63
// d0.val[0]: 00 10 20 30 40 50 60 70
// d0.val[1]: 04 14 24 34 44 54 64 74
// d1.val[0]: 01 11 21 31 41 51 61 71
// d1.val[1]: 05 15 25 35 45 55 65 75
// d2.val[0]: 02 12 22 32 42 52 62 72
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]);
const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]);
const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]);

View File

@ -136,8 +136,8 @@ DSP_SRCS-yes += loopfilter.c
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/loopfilter_neon.c
DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
@ -145,9 +145,6 @@ DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/loopfilter_mb_neon.c
DSP_SRCS-yes += arm/loopfilter_16_neon.c
DSP_SRCS-yes += arm/loopfilter_8_neon.c
DSP_SRCS-yes += arm/loopfilter_4_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM