Refactor lpf (size 4 and 8) NEON intrinsics optimization
Also check in the 8x8 8-bit transpose NEON intrinsics optimization, transpose_u8_8x8(). Change-Id: I32d321cf97ea21eab158ac4896990fc9a51681c4
This commit is contained in:
parent
aa0eb67bf7
commit
761e5ec2f6
@ -1,173 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
// 4-tap loop filter over 16 pixel positions at once.
//
// The parameter names keep the original register numbering: q3..q6 carry
// p3..p0 and q7..q10 carry q0..q3 (see the per-parameter comments). p3, p2, q2
// and q3 only contribute to the filter mask. The filtered p1, p0, q0 and q1
// vectors are written through q5r, q6r, q7r and q8r.
static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
                                       uint8x16_t qlimit,   // limit
                                       uint8x16_t qthresh,  // thresh
                                       uint8x16_t q3,       // p3
                                       uint8x16_t q4,       // p2
                                       uint8x16_t q5,       // p1
                                       uint8x16_t q6,       // p0
                                       uint8x16_t q7,       // q0
                                       uint8x16_t q8,       // q1
                                       uint8x16_t q9,       // q2
                                       uint8x16_t q10,      // q3
                                       uint8x16_t *q5r,     // p1
                                       uint8x16_t *q6r,     // p0
                                       uint8x16_t *q7r,     // q0
                                       uint8x16_t *q8r) {   // q1
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  const int16x8_t three = vreinterpretq_s16_u16(vdupq_n_u16(3));

  // Largest absolute step between neighbouring pixels on either side.
  const uint8x16_t abs_p1_p0 = vabdq_u8(q5, q6);
  const uint8x16_t abs_q1_q0 = vabdq_u8(q8, q7);
  const uint8x16_t max_p = vmaxq_u8(vabdq_u8(q3, q4), vabdq_u8(q4, q5));
  const uint8x16_t max_inner = vmaxq_u8(abs_p1_p0, abs_q1_q0);
  const uint8x16_t max_q = vmaxq_u8(vabdq_u8(q9, q8), vabdq_u8(q10, q9));
  const uint8x16_t max_step = vmaxq_u8(vmaxq_u8(max_p, max_inner), max_q);

  // High edge variance: |p1 - p0| > thresh or |q1 - q0| > thresh.
  const uint8x16_t hev =
      vorrq_u8(vcgtq_u8(abs_p1_p0, qthresh), vcgtq_u8(abs_q1_q0, qthresh));
  const int8x16_t hev_s8 = vreinterpretq_s8_u8(hev);

  // Edge test: saturating 2 * |p0 - q0| + |p1 - q1| / 2 must not exceed blimit.
  const uint8x16_t abs_p0_q0 = vabdq_u8(q6, q7);
  const uint8x16_t edge = vqaddq_u8(vqaddq_u8(abs_p0_q0, abs_p0_q0),
                                    vshrq_n_u8(vabdq_u8(q5, q8), 1));
  const uint8x16_t mask =
      vandq_u8(vcgeq_u8(qlimit, max_step), vcgeq_u8(qblimit, edge));

  // Flip the top bit so the pixels can be treated as signed 8-bit values.
  const int8x16_t ps1 = vreinterpretq_s8_u8(veorq_u8(q5, sign_bit));
  const int8x16_t ps0 = vreinterpretq_s8_u8(veorq_u8(q6, sign_bit));
  const int8x16_t qs0 = vreinterpretq_s8_u8(veorq_u8(q7, sign_bit));
  const int8x16_t qs1 = vreinterpretq_s8_u8(veorq_u8(q8, sign_bit));

  // filter = clamp((clamp(ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask,
  // evaluated in 16 bits and narrowed with saturation.
  const int8x16_t outer = vandq_s8(vqsubq_s8(ps1, qs1), hev_s8);
  const int16x8_t sum_lo = vaddw_s8(
      vmulq_s16(vsubl_s8(vget_low_s8(qs0), vget_low_s8(ps0)), three),
      vget_low_s8(outer));
  const int16x8_t sum_hi = vaddw_s8(
      vmulq_s16(vsubl_s8(vget_high_s8(qs0), vget_high_s8(ps0)), three),
      vget_high_s8(outer));
  const int8x16_t filter =
      vandq_s8(vcombine_s8(vqmovn_s16(sum_lo), vqmovn_s16(sum_hi)),
               vreinterpretq_s8_u8(mask));

  const int8x16_t filter1 = vshrq_n_s8(vqaddq_s8(filter, vdupq_n_s8(4)), 3);
  const int8x16_t filter2 = vshrq_n_s8(vqaddq_s8(filter, vdupq_n_s8(3)), 3);
  // The p1/q1 adjustment is only applied where hev is clear.
  const int8x16_t outer_adj = vbicq_s8(vrshrq_n_s8(filter1, 1), hev_s8);

  // Back to unsigned on the way out.
  *q8r = veorq_u8(vreinterpretq_u8_s8(vqsubq_s8(qs1, outer_adj)), sign_bit);
  *q7r = veorq_u8(vreinterpretq_u8_s8(vqsubq_s8(qs0, filter1)), sign_bit);
  *q6r = veorq_u8(vreinterpretq_u8_s8(vqaddq_s8(ps0, filter2)), sign_bit);
  *q5r = veorq_u8(vreinterpretq_u8_s8(vqaddq_s8(ps1, outer_adj)), sign_bit);
}
|
||||
|
||||
// Filters a horizontal edge across 16 columns with the 4-tap loop filter.
// The first eight columns use blimit0/limit0/thresh0 and the last eight use
// blimit1/limit1/thresh1. Eight rows (s - 4 * p .. s + 3 * p) are read; the
// four rows s - 2 * p .. s + p are rewritten.
void vpx_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  const uint8x16_t blimit = vcombine_u8(vld1_u8(blimit0), vld1_u8(blimit1));
  const uint8x16_t limit = vcombine_u8(vld1_u8(limit0), vld1_u8(limit1));
  const uint8x16_t thresh = vcombine_u8(vld1_u8(thresh0), vld1_u8(thresh1));

  uint8_t *const top = s - 4 * p;
  const uint8x16_t p3 = vld1q_u8(top);
  const uint8x16_t p2 = vld1q_u8(top + p);
  const uint8x16_t p1 = vld1q_u8(top + 2 * p);
  const uint8x16_t p0 = vld1q_u8(top + 3 * p);
  const uint8x16_t q0 = vld1q_u8(top + 4 * p);
  const uint8x16_t q1 = vld1q_u8(top + 5 * p);
  const uint8x16_t q2 = vld1q_u8(top + 6 * p);
  const uint8x16_t q3 = vld1q_u8(top + 7 * p);
  uint8x16_t op1, op0, oq0, oq1;

  loop_filter_neon_16(blimit, limit, thresh, p3, p2, p1, p0, q0, q1, q2, q3,
                      &op1, &op0, &oq0, &oq1);

  vst1q_u8(s - 2 * p, op1);
  vst1q_u8(s - p, op0);
  vst1q_u8(s, oq0);
  vst1q_u8(s + p, oq1);
}
|
@ -1,249 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
|
||||
// 4-tap loop filter over 8 pixel positions at once.
//
// The parameter names keep the original register numbering: d3u8..d7u8 carry
// p3..q0 and d16u8..d18u8 carry q1..q3 (see the per-parameter comments). p3,
// p2, q2 and q3 only contribute to the filter mask. The filtered p1, p0, q0
// and q1 vectors are written through d4ru8, d5ru8, d6ru8 and d7ru8.
//
// qs0 - ps0 is formed with a widening subtract, as loop_filter_neon_16() does.
// An 8-bit subtract wraps when |p0 - q0| >= 128; such a lane can still pass
// the mask when blimit is 255 because the edge test saturates, and the wrapped
// difference would then flip the sign of the filter value.
static INLINE void loop_filter_neon(uint8x8_t dblimit,  // flimit
                                    uint8x8_t dlimit,   // limit
                                    uint8x8_t dthresh,  // thresh
                                    uint8x8_t d3u8,     // p3
                                    uint8x8_t d4u8,     // p2
                                    uint8x8_t d5u8,     // p1
                                    uint8x8_t d6u8,     // p0
                                    uint8x8_t d7u8,     // q0
                                    uint8x8_t d16u8,    // q1
                                    uint8x8_t d17u8,    // q2
                                    uint8x8_t d18u8,    // q3
                                    uint8x8_t *d4ru8,   // p1
                                    uint8x8_t *d5ru8,   // p0
                                    uint8x8_t *d6ru8,   // q0
                                    uint8x8_t *d7ru8) { // q1
  const uint8x8_t sign_bit = vdup_n_u8(0x80);

  // Largest absolute step between neighbouring pixels on either side.
  const uint8x8_t abs_p1_p0 = vabd_u8(d5u8, d6u8);
  const uint8x8_t abs_q1_q0 = vabd_u8(d16u8, d7u8);
  const uint8x8_t max_p = vmax_u8(vabd_u8(d3u8, d4u8), vabd_u8(d4u8, d5u8));
  const uint8x8_t max_inner = vmax_u8(abs_p1_p0, abs_q1_q0);
  const uint8x8_t max_q =
      vmax_u8(vabd_u8(d17u8, d16u8), vabd_u8(d18u8, d17u8));
  const uint8x8_t max_step = vmax_u8(vmax_u8(max_p, max_inner), max_q);

  // High edge variance: |p1 - p0| > thresh or |q1 - q0| > thresh.
  const uint8x8_t hev =
      vorr_u8(vcgt_u8(abs_p1_p0, dthresh), vcgt_u8(abs_q1_q0, dthresh));
  const int8x8_t hev_s8 = vreinterpret_s8_u8(hev);

  // Edge test: saturating 2 * |p0 - q0| + |p1 - q1| / 2 must not exceed blimit.
  const uint8x8_t abs_p0_q0 = vabd_u8(d6u8, d7u8);
  const uint8x8_t edge = vqadd_u8(vqadd_u8(abs_p0_q0, abs_p0_q0),
                                  vshr_n_u8(vabd_u8(d5u8, d16u8), 1));
  const uint8x8_t mask =
      vand_u8(vcge_u8(dlimit, max_step), vcge_u8(dblimit, edge));

  // Flip the top bit so the pixels can be treated as signed 8-bit values.
  const int8x8_t ps1 = vreinterpret_s8_u8(veor_u8(d5u8, sign_bit));
  const int8x8_t ps0 = vreinterpret_s8_u8(veor_u8(d6u8, sign_bit));
  const int8x8_t qs0 = vreinterpret_s8_u8(veor_u8(d7u8, sign_bit));
  const int8x8_t qs1 = vreinterpret_s8_u8(veor_u8(d16u8, sign_bit));

  // filter = clamp((clamp(ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask,
  // evaluated in 16 bits and narrowed with saturation.
  const int8x8_t outer = vand_s8(vqsub_s8(ps1, qs1), hev_s8);
  const int16x8_t sum =
      vaddw_s8(vmulq_s16(vsubl_s8(qs0, ps0), vdupq_n_s16(3)), outer);
  const int8x8_t filter = vand_s8(vqmovn_s16(sum), vreinterpret_s8_u8(mask));

  const int8x8_t filter1 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(4)), 3);
  const int8x8_t filter2 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(3)), 3);
  // The p1/q1 adjustment is only applied where hev is clear.
  const int8x8_t outer_adj = vbic_s8(vrshr_n_s8(filter1, 1), hev_s8);

  // Back to unsigned on the way out.
  *d4ru8 = veor_u8(vreinterpret_u8_s8(vqadd_s8(ps1, outer_adj)), sign_bit);
  *d5ru8 = veor_u8(vreinterpret_u8_s8(vqadd_s8(ps0, filter2)), sign_bit);
  *d6ru8 = veor_u8(vreinterpret_u8_s8(vqsub_s8(qs0, filter1)), sign_bit);
  *d7ru8 = veor_u8(vreinterpret_u8_s8(vqsub_s8(qs1, outer_adj)), sign_bit);
}
|
||||
|
||||
// Filters a horizontal edge across 8 columns with the 4-tap loop filter.
// Eight rows (src - 4 * pitch .. src + 3 * pitch) are read; the four rows
// src - 2 * pitch .. src + pitch are rewritten.
void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t blimit_vec = vld1_u8(blimit);
  const uint8x8_t limit_vec = vld1_u8(limit);
  const uint8x8_t thresh_vec = vld1_u8(thresh);

  uint8_t *const top = src - 4 * pitch;
  const uint8x8_t p3 = vld1_u8(top);
  const uint8x8_t p2 = vld1_u8(top + pitch);
  const uint8x8_t p1 = vld1_u8(top + 2 * pitch);
  const uint8x8_t p0 = vld1_u8(top + 3 * pitch);
  const uint8x8_t q0 = vld1_u8(top + 4 * pitch);
  const uint8x8_t q1 = vld1_u8(top + 5 * pitch);
  const uint8x8_t q2 = vld1_u8(top + 6 * pitch);
  const uint8x8_t q3 = vld1_u8(top + 7 * pitch);
  uint8x8_t op1, op0, oq0, oq1;

  loop_filter_neon(blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                   q2, q3, &op1, &op0, &oq0, &oq1);

  vst1_u8(src - 2 * pitch, op1);
  vst1_u8(src - pitch, op0);
  vst1_u8(src, oq0);
  vst1_u8(src + pitch, oq1);
}
|
||||
|
||||
// Filters a vertical edge across 8 rows with the 4-tap loop filter.
// Each of the 8 rows contributes the 8 bytes starting at src - 4. The 8x8
// tile is transposed so every vector holds one pixel column, filtered, and
// the four modified columns are stored back to the 4 bytes starting at
// src - 2 of each row.
void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t blimit_vec = vld1_u8(blimit);
  const uint8x8_t limit_vec = vld1_u8(limit);
  const uint8x8_t thresh_vec = vld1_u8(thresh);

  const uint8_t *const in = src - 4;
  const uint8x8_t row0 = vld1_u8(in);
  const uint8x8_t row1 = vld1_u8(in + pitch);
  const uint8x8_t row2 = vld1_u8(in + 2 * pitch);
  const uint8x8_t row3 = vld1_u8(in + 3 * pitch);
  const uint8x8_t row4 = vld1_u8(in + 4 * pitch);
  const uint8x8_t row5 = vld1_u8(in + 5 * pitch);
  const uint8x8_t row6 = vld1_u8(in + 6 * pitch);
  const uint8x8_t row7 = vld1_u8(in + 7 * pitch);

  // 8x8 byte transpose in three interleave stages: 32-, 16- and 8-bit.
  const uint32x2x2_t w04 =
      vtrn_u32(vreinterpret_u32_u8(row0), vreinterpret_u32_u8(row4));
  const uint32x2x2_t w15 =
      vtrn_u32(vreinterpret_u32_u8(row1), vreinterpret_u32_u8(row5));
  const uint32x2x2_t w26 =
      vtrn_u32(vreinterpret_u32_u8(row2), vreinterpret_u32_u8(row6));
  const uint32x2x2_t w37 =
      vtrn_u32(vreinterpret_u32_u8(row3), vreinterpret_u32_u8(row7));

  const uint16x4x2_t h0 = vtrn_u16(vreinterpret_u16_u32(w04.val[0]),
                                   vreinterpret_u16_u32(w26.val[0]));
  const uint16x4x2_t h1 = vtrn_u16(vreinterpret_u16_u32(w15.val[0]),
                                   vreinterpret_u16_u32(w37.val[0]));
  const uint16x4x2_t h2 = vtrn_u16(vreinterpret_u16_u32(w04.val[1]),
                                   vreinterpret_u16_u32(w26.val[1]));
  const uint16x4x2_t h3 = vtrn_u16(vreinterpret_u16_u32(w15.val[1]),
                                   vreinterpret_u16_u32(w37.val[1]));

  const uint8x8x2_t c01 = vtrn_u8(vreinterpret_u8_u16(h0.val[0]),
                                  vreinterpret_u8_u16(h1.val[0]));
  const uint8x8x2_t c23 = vtrn_u8(vreinterpret_u8_u16(h0.val[1]),
                                  vreinterpret_u8_u16(h1.val[1]));
  const uint8x8x2_t c45 = vtrn_u8(vreinterpret_u8_u16(h2.val[0]),
                                  vreinterpret_u8_u16(h3.val[0]));
  const uint8x8x2_t c67 = vtrn_u8(vreinterpret_u8_u16(h2.val[1]),
                                  vreinterpret_u8_u16(h3.val[1]));

  uint8x8x4_t out;  // filtered p1, p0, q0, q1 columns
  uint8_t *dst = src - 2;

  // Columns in order: p3, p2, p1, p0, q0, q1, q2, q3.
  loop_filter_neon(blimit_vec, limit_vec, thresh_vec, c01.val[0], c01.val[1],
                   c23.val[0], c23.val[1], c45.val[0], c45.val[1], c67.val[0],
                   c67.val[1], &out.val[0], &out.val[1], &out.val[2],
                   &out.val[3]);

  // The lane index must be a compile-time constant, so the row stores are
  // written out one by one.
  vst4_lane_u8(dst, out, 0);
  dst += pitch;
  vst4_lane_u8(dst, out, 1);
  dst += pitch;
  vst4_lane_u8(dst, out, 2);
  dst += pitch;
  vst4_lane_u8(dst, out, 3);
  dst += pitch;
  vst4_lane_u8(dst, out, 4);
  dst += pitch;
  vst4_lane_u8(dst, out, 5);
  dst += pitch;
  vst4_lane_u8(dst, out, 6);
  dst += pitch;
  vst4_lane_u8(dst, out, 7);
}
|
@ -1,445 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
|
||||
// 8-tap ("mb") loop filter over 8 pixel positions at once.
//
// The parameter names keep the original register numbering: d3u8..d7u8 carry
// p3..q0 and d16u8..d18u8 carry q1..q3. Six output vectors (p2, p1, p0, q0,
// q1, q2) are written through d0ru8..d5ru8.
//
// Per lane:
//   - where flat is set, the output is the rounded 7-tap average;
//   - otherwise p1, p0, q0 and q1 get the 4-tap filter result, and p2 and q2
//     pass through unchanged.
// Whole-vector shortcuts skip the 4-tap work when every lane is flat and the
// 7-tap work when no lane is flat.
//
// qs0 - ps0 is formed with a widening subtract, as loop_filter_neon_16() does.
// An 8-bit subtract wraps when |p0 - q0| >= 128; such a lane can still pass
// the mask when blimit is 255 because the edge test saturates, and the wrapped
// difference would then flip the sign of the filter value.
static INLINE void mbloop_filter_neon(uint8x8_t dblimit,  // mblimit
                                      uint8x8_t dlimit,   // limit
                                      uint8x8_t dthresh,  // thresh
                                      uint8x8_t d3u8,     // p3
                                      uint8x8_t d4u8,     // p2
                                      uint8x8_t d5u8,     // p1
                                      uint8x8_t d6u8,     // p0
                                      uint8x8_t d7u8,     // q0
                                      uint8x8_t d16u8,    // q1
                                      uint8x8_t d17u8,    // q2
                                      uint8x8_t d18u8,    // q3
                                      uint8x8_t *d0ru8,   // p2
                                      uint8x8_t *d1ru8,   // p1
                                      uint8x8_t *d2ru8,   // p0
                                      uint8x8_t *d3ru8,   // q0
                                      uint8x8_t *d4ru8,   // q1
                                      uint8x8_t *d5ru8) { // q2
  // Largest absolute step between neighbouring pixels on either side.
  const uint8x8_t abs_p1_p0 = vabd_u8(d5u8, d6u8);
  const uint8x8_t abs_q1_q0 = vabd_u8(d16u8, d7u8);
  const uint8x8_t max_inner = vmax_u8(abs_p1_p0, abs_q1_q0);
  const uint8x8_t max_p = vmax_u8(vabd_u8(d3u8, d4u8), vabd_u8(d4u8, d5u8));
  const uint8x8_t max_q =
      vmax_u8(vabd_u8(d17u8, d16u8), vabd_u8(d18u8, d17u8));
  const uint8x8_t max_step = vmax_u8(vmax_u8(max_p, max_inner), max_q);

  // Edge test: saturating 2 * |p0 - q0| + |p1 - q1| / 2 must not exceed blimit.
  const uint8x8_t abs_p0_q0 = vabd_u8(d6u8, d7u8);
  const uint8x8_t edge = vqadd_u8(vqadd_u8(abs_p0_q0, abs_p0_q0),
                                  vshr_n_u8(vabd_u8(d5u8, d16u8), 1));
  const uint8x8_t mask =
      vand_u8(vcge_u8(dlimit, max_step), vcge_u8(dblimit, edge));

  // High edge variance: |p1 - p0| > thresh or |q1 - q0| > thresh.
  const uint8x8_t hev =
      vorr_u8(vcgt_u8(abs_p1_p0, dthresh), vcgt_u8(abs_q1_q0, dthresh));

  // flat: p1..p3 are all within 1 of p0, q1..q3 are all within 1 of q0, and
  // the lane passes the mask.
  const uint8x8_t max_dist2 =
      vmax_u8(vabd_u8(d6u8, d4u8), vabd_u8(d7u8, d17u8));
  const uint8x8_t max_dist3 =
      vmax_u8(vabd_u8(d3u8, d6u8), vabd_u8(d18u8, d7u8));
  const uint8x8_t max_flat =
      vmax_u8(max_inner, vmax_u8(max_dist2, max_dist3));
  const uint8x8_t flat = vand_u8(vcge_u8(vdup_n_u8(1), max_flat), mask);

  // Every flat lane is 0x00 or 0xff. The narrowing shift packs one nibble per
  // lane into the low 32 bits, so a scalar compare can test for "no lane
  // flat" (0) and "every lane flat" (0xffffffff).
  const uint16x4_t flat_u16 = vreinterpret_u16_u8(flat);
  const uint8x8_t flat_nibbles =
      vshrn_n_u16(vcombine_u16(flat_u16, flat_u16), 4);
  const uint32_t flat_status =
      vget_lane_u32(vreinterpret_u32_u8(flat_nibbles), 0);

  // 4-tap results. When every lane is flat they are never selected, so the
  // unfiltered pixels serve as placeholders.
  uint8x8_t f4_p1 = d5u8, f4_p0 = d6u8, f4_q0 = d7u8, f4_q1 = d16u8;
  uint8x8_t tap_p2, tap_p1, tap_p0, tap_q0, tap_q1, tap_q2;
  uint16x8_t sum;

  if (flat_status != 0xffffffff) {
    const uint8x8_t sign_bit = vdup_n_u8(0x80);
    const int8x8_t hev_s8 = vreinterpret_s8_u8(hev);
    // Flip the top bit so the pixels can be treated as signed 8-bit values.
    const int8x8_t ps1 = vreinterpret_s8_u8(veor_u8(d5u8, sign_bit));
    const int8x8_t ps0 = vreinterpret_s8_u8(veor_u8(d6u8, sign_bit));
    const int8x8_t qs0 = vreinterpret_s8_u8(veor_u8(d7u8, sign_bit));
    const int8x8_t qs1 = vreinterpret_s8_u8(veor_u8(d16u8, sign_bit));

    // filter = clamp((clamp(ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask,
    // evaluated in 16 bits and narrowed with saturation.
    const int8x8_t outer = vand_s8(vqsub_s8(ps1, qs1), hev_s8);
    const int16x8_t wide =
        vaddw_s8(vmulq_s16(vsubl_s8(qs0, ps0), vdupq_n_s16(3)), outer);
    const int8x8_t filter =
        vand_s8(vqmovn_s16(wide), vreinterpret_s8_u8(mask));

    const int8x8_t filter1 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(4)), 3);
    const int8x8_t filter2 = vshr_n_s8(vqadd_s8(filter, vdup_n_s8(3)), 3);
    // The p1/q1 adjustment is only applied where hev is clear.
    const int8x8_t outer_adj = vbic_s8(vrshr_n_s8(filter1, 1), hev_s8);

    f4_p1 = veor_u8(vreinterpret_u8_s8(vqadd_s8(ps1, outer_adj)), sign_bit);
    f4_p0 = veor_u8(vreinterpret_u8_s8(vqadd_s8(ps0, filter2)), sign_bit);
    f4_q0 = veor_u8(vreinterpret_u8_s8(vqsub_s8(qs0, filter1)), sign_bit);
    f4_q1 = veor_u8(vreinterpret_u8_s8(vqsub_s8(qs1, outer_adj)), sign_bit);
  }

  if (flat_status == 0) {
    // No lane is flat: 4-tap results only, p2 and q2 pass through.
    *d0ru8 = d4u8;
    *d1ru8 = f4_p1;
    *d2ru8 = f4_p0;
    *d3ru8 = f4_q0;
    *d4ru8 = f4_q1;
    *d5ru8 = d17u8;
    return;
  }

  // 7-tap averages from a running 16-bit sum; each output slides the window
  // one pixel by dropping two terms and adding two.
  // op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
  sum = vaddl_u8(d6u8, d7u8);
  sum = vmlal_u8(sum, d3u8, vdup_n_u8(3));
  sum = vmlal_u8(sum, d4u8, vdup_n_u8(2));
  sum = vaddw_u8(sum, d5u8);
  tap_p2 = vqrshrn_n_u16(sum, 3);

  // op1: drop p3 and p2, add p1 and q1.
  sum = vsubw_u8(sum, d3u8);
  sum = vsubw_u8(sum, d4u8);
  sum = vaddw_u8(sum, d5u8);
  sum = vaddw_u8(sum, d16u8);
  tap_p1 = vqrshrn_n_u16(sum, 3);

  // op0: drop p3 and p1, add p0 and q2.
  sum = vsubw_u8(sum, d3u8);
  sum = vsubw_u8(sum, d5u8);
  sum = vaddw_u8(sum, d6u8);
  sum = vaddw_u8(sum, d17u8);
  tap_p0 = vqrshrn_n_u16(sum, 3);

  // oq0: drop p3 and p0, add q0 and q3.
  sum = vsubw_u8(sum, d3u8);
  sum = vsubw_u8(sum, d6u8);
  sum = vaddw_u8(sum, d7u8);
  sum = vaddw_u8(sum, d18u8);
  tap_q0 = vqrshrn_n_u16(sum, 3);

  // oq1: drop p2 and q0, add q1 and q3.
  sum = vsubw_u8(sum, d4u8);
  sum = vsubw_u8(sum, d7u8);
  sum = vaddw_u8(sum, d16u8);
  sum = vaddw_u8(sum, d18u8);
  tap_q1 = vqrshrn_n_u16(sum, 3);

  // oq2: drop p1 and q1, add q2 and q3.
  sum = vsubw_u8(sum, d5u8);
  sum = vsubw_u8(sum, d16u8);
  sum = vaddw_u8(sum, d17u8);
  sum = vaddw_u8(sum, d18u8);
  tap_q2 = vqrshrn_n_u16(sum, 3);

  // Per lane: 7-tap value where flat, otherwise the 4-tap value (p2 and q2
  // unchanged). When every lane is flat the select returns the 7-tap values
  // regardless of the second operand.
  *d0ru8 = vbsl_u8(flat, tap_p2, d4u8);
  *d1ru8 = vbsl_u8(flat, tap_p1, f4_p1);
  *d2ru8 = vbsl_u8(flat, tap_p0, f4_p0);
  *d3ru8 = vbsl_u8(flat, tap_q0, f4_q0);
  *d4ru8 = vbsl_u8(flat, tap_q1, f4_q1);
  *d5ru8 = vbsl_u8(flat, tap_q2, d17u8);
}
|
||||
|
||||
// Filters a horizontal edge across 8 columns with the 8-tap loop filter.
// Eight rows (src - 4 * pitch .. src + 3 * pitch) are read; the six rows
// src - 3 * pitch .. src + 2 * pitch are rewritten.
void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t blimit_vec = vld1_u8(blimit);
  const uint8x8_t limit_vec = vld1_u8(limit);
  const uint8x8_t thresh_vec = vld1_u8(thresh);

  uint8_t *const top = src - 4 * pitch;
  const uint8x8_t p3 = vld1_u8(top);
  const uint8x8_t p2 = vld1_u8(top + pitch);
  const uint8x8_t p1 = vld1_u8(top + 2 * pitch);
  const uint8x8_t p0 = vld1_u8(top + 3 * pitch);
  const uint8x8_t q0 = vld1_u8(top + 4 * pitch);
  const uint8x8_t q1 = vld1_u8(top + 5 * pitch);
  const uint8x8_t q2 = vld1_u8(top + 6 * pitch);
  const uint8x8_t q3 = vld1_u8(top + 7 * pitch);
  uint8x8_t op2, op1, op0, oq0, oq1, oq2;

  mbloop_filter_neon(blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                     q2, q3, &op2, &op1, &op0, &oq0, &oq1, &oq2);

  vst1_u8(src - 3 * pitch, op2);
  vst1_u8(src - 2 * pitch, op1);
  vst1_u8(src - pitch, op0);
  vst1_u8(src, oq0);
  vst1_u8(src + pitch, oq1);
  vst1_u8(src + 2 * pitch, oq2);
}
|
||||
|
||||
// Filters a horizontal edge across 16 columns as two independent 8-column
// passes: columns 0..7 with the "0" thresholds, then columns 8..15 with the
// "1" thresholds.
void vpx_lpf_horizontal_8_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8_t *const right_half = s + 8;

  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_neon(right_half, p, blimit1, limit1, thresh1);
}
|
||||
|
||||
// Filters a vertical edge across 8 rows with the 8-tap loop filter.
// Each of the 8 rows contributes the 8 bytes starting at src - 4. The 8x8
// tile is transposed so every vector holds one pixel column and is then
// filtered. The six modified columns are stored back per row as 4 bytes at
// src - 3 followed by 2 bytes at src + 1.
void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t blimit_vec = vld1_u8(blimit);
  const uint8x8_t limit_vec = vld1_u8(limit);
  const uint8x8_t thresh_vec = vld1_u8(thresh);

  const uint8_t *const in = src - 4;
  const uint8x8_t row0 = vld1_u8(in);
  const uint8x8_t row1 = vld1_u8(in + pitch);
  const uint8x8_t row2 = vld1_u8(in + 2 * pitch);
  const uint8x8_t row3 = vld1_u8(in + 3 * pitch);
  const uint8x8_t row4 = vld1_u8(in + 4 * pitch);
  const uint8x8_t row5 = vld1_u8(in + 5 * pitch);
  const uint8x8_t row6 = vld1_u8(in + 6 * pitch);
  const uint8x8_t row7 = vld1_u8(in + 7 * pitch);

  // 8x8 byte transpose in three interleave stages: 32-, 16- and 8-bit.
  const uint32x2x2_t w04 =
      vtrn_u32(vreinterpret_u32_u8(row0), vreinterpret_u32_u8(row4));
  const uint32x2x2_t w15 =
      vtrn_u32(vreinterpret_u32_u8(row1), vreinterpret_u32_u8(row5));
  const uint32x2x2_t w26 =
      vtrn_u32(vreinterpret_u32_u8(row2), vreinterpret_u32_u8(row6));
  const uint32x2x2_t w37 =
      vtrn_u32(vreinterpret_u32_u8(row3), vreinterpret_u32_u8(row7));

  const uint16x4x2_t h0 = vtrn_u16(vreinterpret_u16_u32(w04.val[0]),
                                   vreinterpret_u16_u32(w26.val[0]));
  const uint16x4x2_t h1 = vtrn_u16(vreinterpret_u16_u32(w15.val[0]),
                                   vreinterpret_u16_u32(w37.val[0]));
  const uint16x4x2_t h2 = vtrn_u16(vreinterpret_u16_u32(w04.val[1]),
                                   vreinterpret_u16_u32(w26.val[1]));
  const uint16x4x2_t h3 = vtrn_u16(vreinterpret_u16_u32(w15.val[1]),
                                   vreinterpret_u16_u32(w37.val[1]));

  const uint8x8x2_t c01 = vtrn_u8(vreinterpret_u8_u16(h0.val[0]),
                                  vreinterpret_u8_u16(h1.val[0]));
  const uint8x8x2_t c23 = vtrn_u8(vreinterpret_u8_u16(h0.val[1]),
                                  vreinterpret_u8_u16(h1.val[1]));
  const uint8x8x2_t c45 = vtrn_u8(vreinterpret_u8_u16(h2.val[0]),
                                  vreinterpret_u8_u16(h3.val[0]));
  const uint8x8x2_t c67 = vtrn_u8(vreinterpret_u8_u16(h2.val[1]),
                                  vreinterpret_u8_u16(h3.val[1]));

  uint8x8x4_t out_left;   // filtered p2, p1, p0, q0 columns
  uint8x8x2_t out_right;  // filtered q1, q2 columns
  uint8_t *dst;

  // Columns in order: p3, p2, p1, p0, q0, q1, q2, q3.
  mbloop_filter_neon(blimit_vec, limit_vec, thresh_vec, c01.val[0], c01.val[1],
                     c23.val[0], c23.val[1], c45.val[0], c45.val[1],
                     c67.val[0], c67.val[1], &out_left.val[0],
                     &out_left.val[1], &out_left.val[2], &out_left.val[3],
                     &out_right.val[0], &out_right.val[1]);

  // The lane index must be a compile-time constant, so the row stores are
  // written out one by one.
  dst = src - 3;
  vst4_lane_u8(dst, out_left, 0);
  dst += pitch;
  vst4_lane_u8(dst, out_left, 1);
  dst += pitch;
  vst4_lane_u8(dst, out_left, 2);
  dst += pitch;
  vst4_lane_u8(dst, out_left, 3);
  dst += pitch;
  vst4_lane_u8(dst, out_left, 4);
  dst += pitch;
  vst4_lane_u8(dst, out_left, 5);
  dst += pitch;
  vst4_lane_u8(dst, out_left, 6);
  dst += pitch;
  vst4_lane_u8(dst, out_left, 7);

  dst = src + 1;
  vst2_lane_u8(dst, out_right, 0);
  dst += pitch;
  vst2_lane_u8(dst, out_right, 1);
  dst += pitch;
  vst2_lane_u8(dst, out_right, 2);
  dst += pitch;
  vst2_lane_u8(dst, out_right, 3);
  dst += pitch;
  vst2_lane_u8(dst, out_right, 4);
  dst += pitch;
  vst2_lane_u8(dst, out_right, 5);
  dst += pitch;
  vst2_lane_u8(dst, out_right, 6);
  dst += pitch;
  vst2_lane_u8(dst, out_right, 7);
}
|
||||
|
||||
// Filters a vertical edge across 16 rows as two independent 8-row passes:
// rows 0..7 with the "0" thresholds, then rows 8..15 with the "1" thresholds.
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const lower_half = s + 8 * p;

  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_neon(lower_half, p, blimit1, limit1, thresh1);
}
|
@ -31,6 +31,15 @@ FUN_LOAD_THRESH(8, _) // load_thresh_8
|
||||
FUN_LOAD_THRESH(16, q_) // load_thresh_16
|
||||
#undef FUN_LOAD_THRESH
|
||||
|
||||
// Builds 16-lane threshold vectors for a dual filter. The byte at each "0"
// pointer is broadcast across lanes 0..7 and the byte at the matching "1"
// pointer across lanes 8..15.
static INLINE void load_thresh_8_dual(
    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
    uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
  uint8x8_t first, second;

  first = vld1_dup_u8(blimit0);
  second = vld1_dup_u8(blimit1);
  *blimit_vec = vcombine_u8(first, second);

  first = vld1_dup_u8(limit0);
  second = vld1_dup_u8(limit1);
  *limit_vec = vcombine_u8(first, second);

  first = vld1_dup_u8(thresh0);
  second = vld1_dup_u8(thresh1);
  *thresh_vec = vcombine_u8(first, second);
}
|
||||
|
||||
// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a
|
||||
// pixel. When used to control filter branches, we only detect whether it is all
|
||||
// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
|
||||
@ -56,6 +65,38 @@ static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
|
||||
return calc_flat_status_8(flat_4bit);
|
||||
}
|
||||
|
||||
// Generates filter_hev_mask4_<w>(), which computes per lane:
//   *hev  - set when max(|p1 - p0|, |q1 - q0|) > thresh;
//   *mask - set when every neighbouring step (p3..p0, q0..q3) is <= limit and
//           the saturating sum 2 * |p0 - q0| + |p1 - q1| / 2 is <= blimit.
// The return value is max(|p1 - p0|, |q1 - q0|), for callers that extend it
// into a flatness test.
#define FUN_FILTER_HEV_MASK4(w, r)                                            \
  static INLINE uint8x##w##_t filter_hev_mask4_##w(                           \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) {      \
    const uint8x##w##_t max =                                                 \
        vmax##r##u8(vabd##r##u8(p1, p0), vabd##r##u8(q1, q0));                \
    const uint8x##w##_t abs_p0_q0 = vabd##r##u8(p0, q0);                      \
    const uint8x##w##_t half_p1_q1 = vshr##r##n_u8(vabd##r##u8(p1, q1), 1);   \
    const uint8x##w##_t edge =                                                \
        vqadd##r##u8(vqadd##r##u8(abs_p0_q0, abs_p0_q0), half_p1_q1);         \
    uint8x##w##_t step = max;                                                 \
                                                                              \
    step = vmax##r##u8(step, vabd##r##u8(p3, p2));                            \
    step = vmax##r##u8(step, vabd##r##u8(p2, p1));                            \
    step = vmax##r##u8(step, vabd##r##u8(q2, q1));                            \
    step = vmax##r##u8(step, vabd##r##u8(q3, q2));                            \
                                                                              \
    *hev = vcgt##r##u8(max, thresh);                                          \
    *mask = vand##r##u8(vcle##r##u8(step, limit), vcle##r##u8(edge, blimit)); \
                                                                              \
    return max;                                                               \
  }
|
||||
|
||||
FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8
|
||||
FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16
|
||||
#undef FUN_FILTER_HEV_MASK4
|
||||
|
||||
#define FUN_FILTER_FLAT_HEV_MASK(w, r) \
|
||||
static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \
|
||||
const uint8x##w##_t limit, const uint8x##w##_t blimit, \
|
||||
@ -64,25 +105,11 @@ static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
|
||||
const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
|
||||
const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \
|
||||
uint8x##w##_t *hev) { \
|
||||
uint8x##w##_t t0, t1, mask; \
|
||||
\
|
||||
mask = vabd##r##u8(p1, p0); \
|
||||
mask = vmax##r##u8(mask, vabd##r##u8(q1, q0)); \
|
||||
*hev = vcgt##r##u8(mask, thresh); \
|
||||
*flat = vmax##r##u8(mask, vabd##r##u8(p2, p0)); \
|
||||
mask = vmax##r##u8(mask, vabd##r##u8(p3, p2)); \
|
||||
mask = vmax##r##u8(mask, vabd##r##u8(p2, p1)); \
|
||||
mask = vmax##r##u8(mask, vabd##r##u8(q2, q1)); \
|
||||
mask = vmax##r##u8(mask, vabd##r##u8(q3, q2)); \
|
||||
t0 = vabd##r##u8(p0, q0); \
|
||||
t1 = vabd##r##u8(p1, q1); \
|
||||
t0 = vqadd##r##u8(t0, t0); \
|
||||
t1 = vshr##r##n_u8(t1, 1); \
|
||||
t0 = vqadd##r##u8(t0, t1); \
|
||||
mask = vcle##r##u8(mask, limit); \
|
||||
t0 = vcle##r##u8(t0, blimit); \
|
||||
mask = vand##r##u8(mask, t0); \
|
||||
uint8x##w##_t max, mask; \
|
||||
\
|
||||
max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
|
||||
q2, q3, hev, &mask); \
|
||||
*flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \
|
||||
*flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \
|
||||
*flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \
|
||||
*flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \
|
||||
@ -420,6 +447,33 @@ FUN_FILTER4(8, _) // filter4_8
|
||||
FUN_FILTER4(16, q_) // filter4_16
|
||||
#undef FUN_FILTER4
|
||||
|
||||
#define FUN_FILTER8(w) \
|
||||
static INLINE void filter8_##w( \
|
||||
const uint8x##w##_t mask, const uint8x##w##_t flat, \
|
||||
const uint32_t flat_status, const uint8x##w##_t hev, \
|
||||
const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \
|
||||
const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \
|
||||
const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \
|
||||
uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \
|
||||
uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \
|
||||
if (flat_status != (uint32_t)-2) { \
|
||||
filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \
|
||||
*op2 = p2; \
|
||||
*oq2 = q2; \
|
||||
if (flat_status) { \
|
||||
apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
|
||||
op0, oq0, oq1, oq2); \
|
||||
} \
|
||||
} else { \
|
||||
calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \
|
||||
oq0, oq1, oq2); \
|
||||
} \
|
||||
}
|
||||
|
||||
FUN_FILTER8(8) // filter8_8
|
||||
FUN_FILTER8(16) // filter8_16
|
||||
#undef FUN_FILTER8
|
||||
|
||||
#define FUN_FILTER16(w) \
|
||||
static INLINE void filter16_##w( \
|
||||
const uint8x##w##_t mask, const uint8x##w##_t flat, \
|
||||
@ -481,6 +535,7 @@ FUN_FILTER16(16) // filter16_16
|
||||
*q3 = vld1##r##u8(s); \
|
||||
}
|
||||
|
||||
FUN_LOAD8(8, _) // load_8x8
|
||||
FUN_LOAD8(16, q_) // load_16x8
|
||||
#undef FUN_LOAD8
|
||||
|
||||
@ -529,6 +584,71 @@ FUN_LOAD16(8, _) // load_8x16
|
||||
FUN_LOAD16(16, q_) // load_16x16
|
||||
#undef FUN_LOAD16
|
||||
|
||||
#define FUN_STORE4(w, r) \
|
||||
static INLINE void store_##w##x4( \
|
||||
uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
|
||||
const uint8x##w##_t s2, const uint8x##w##_t s3) { \
|
||||
vst1##r##u8(s, s0); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s1); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s2); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s3); \
|
||||
}
|
||||
|
||||
FUN_STORE4(8, _) // store_8x4
|
||||
FUN_STORE4(16, q_) // store_16x4
|
||||
#undef FUN_STORE4
|
||||
|
||||
#define FUN_STORE6(w, r) \
|
||||
static INLINE void store_##w##x6( \
|
||||
uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
|
||||
const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
|
||||
const uint8x##w##_t s5) { \
|
||||
vst1##r##u8(s, s0); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s1); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s2); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s3); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s4); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s5); \
|
||||
}
|
||||
|
||||
FUN_STORE6(8, _) // store_8x6
|
||||
FUN_STORE6(16, q_) // store_16x6
|
||||
#undef FUN_STORE6
|
||||
|
||||
static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
|
||||
const uint8x8_t p0, const uint8x8_t q0,
|
||||
const uint8x8_t q1) {
|
||||
uint8x8x4_t o;
|
||||
|
||||
o.val[0] = p1;
|
||||
o.val[1] = p0;
|
||||
o.val[2] = q0;
|
||||
o.val[3] = q1;
|
||||
vst4_lane_u8(s, o, 0);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 1);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 2);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 3);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 4);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 5);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 6);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 7);
|
||||
}
|
||||
|
||||
static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
|
||||
const uint8x8_t s1, const uint8x8_t s2,
|
||||
const uint8x8_t s3, const uint8x8_t s4,
|
||||
@ -566,53 +686,64 @@ static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
|
||||
vst3_lane_u8(s + 0, o1, 7);
|
||||
}
|
||||
|
||||
static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
|
||||
const uint8x8_t p0, const uint8x8_t q0,
|
||||
const uint8x8_t q1) {
|
||||
uint8x8x4_t o;
|
||||
#define FUN_STORE8(w, r) \
|
||||
static INLINE void store_##w##x8( \
|
||||
uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
|
||||
const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
|
||||
const uint8x##w##_t s5, const uint8x##w##_t s6, \
|
||||
const uint8x##w##_t s7) { \
|
||||
vst1##r##u8(s, s0); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s1); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s2); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s3); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s4); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s5); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s6); \
|
||||
s += p; \
|
||||
vst1##r##u8(s, s7); \
|
||||
}
|
||||
|
||||
o.val[0] = p1;
|
||||
o.val[1] = p0;
|
||||
o.val[2] = q0;
|
||||
o.val[3] = q1;
|
||||
vst4_lane_u8(s, o, 0);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 1);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 2);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 3);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 4);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 5);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 6);
|
||||
s += p;
|
||||
vst4_lane_u8(s, o, 7);
|
||||
}
|
||||
FUN_STORE8(8, _) // store_8x8
|
||||
FUN_STORE8(16, q_) // store_16x8
|
||||
#undef FUN_STORE8
|
||||
|
||||
static INLINE void store_16x8(uint8_t *s, const int p, const uint8x16_t s0,
|
||||
const uint8x16_t s1, const uint8x16_t s2,
|
||||
const uint8x16_t s3, const uint8x16_t s4,
|
||||
const uint8x16_t s5, const uint8x16_t s6,
|
||||
const uint8x16_t s7) {
|
||||
vst1q_u8(s, s0);
|
||||
s += p;
|
||||
vst1q_u8(s, s1);
|
||||
s += p;
|
||||
vst1q_u8(s, s2);
|
||||
s += p;
|
||||
vst1q_u8(s, s3);
|
||||
s += p;
|
||||
vst1q_u8(s, s4);
|
||||
s += p;
|
||||
vst1q_u8(s, s5);
|
||||
s += p;
|
||||
vst1q_u8(s, s6);
|
||||
s += p;
|
||||
vst1q_u8(s, s7);
|
||||
}
|
||||
#define FUN_STORE14(w, r) \
|
||||
static INLINE void store_##w##x14( \
|
||||
uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
|
||||
const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
|
||||
const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
|
||||
const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
|
||||
const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
|
||||
const uint32_t flat_status, const uint32_t flat2_status) { \
|
||||
if (flat_status) { \
|
||||
if (flat2_status) { \
|
||||
vst1##r##u8(s - 7 * p, p6); \
|
||||
vst1##r##u8(s - 6 * p, p5); \
|
||||
vst1##r##u8(s - 5 * p, p4); \
|
||||
vst1##r##u8(s - 4 * p, p3); \
|
||||
vst1##r##u8(s + 3 * p, q3); \
|
||||
vst1##r##u8(s + 4 * p, q4); \
|
||||
vst1##r##u8(s + 5 * p, q5); \
|
||||
vst1##r##u8(s + 6 * p, q6); \
|
||||
} \
|
||||
vst1##r##u8(s - 3 * p, p2); \
|
||||
vst1##r##u8(s + 2 * p, q2); \
|
||||
} \
|
||||
vst1##r##u8(s - 2 * p, p1); \
|
||||
vst1##r##u8(s - 1 * p, p0); \
|
||||
vst1##r##u8(s + 0 * p, q0); \
|
||||
vst1##r##u8(s + 1 * p, q1); \
|
||||
}
|
||||
|
||||
FUN_STORE14(8, _) // store_8x14
|
||||
FUN_STORE14(16, q_) // store_16x14
|
||||
#undef FUN_STORE14
|
||||
|
||||
static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
|
||||
const uint8x16_t s1, const uint8x16_t s2,
|
||||
@ -656,37 +787,160 @@ static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
|
||||
vst1q_u8(s, s15);
|
||||
}
|
||||
|
||||
#define FUN_STORE14(w, r) \
|
||||
static INLINE void store_##w##x14( \
|
||||
uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
|
||||
const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
|
||||
const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
|
||||
const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
|
||||
const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
|
||||
const uint32_t flat_status, const uint32_t flat2_status) { \
|
||||
if (flat_status) { \
|
||||
if (flat2_status) { \
|
||||
vst1##r##u8(s - 7 * p, p6); \
|
||||
vst1##r##u8(s - 6 * p, p5); \
|
||||
vst1##r##u8(s - 5 * p, p4); \
|
||||
vst1##r##u8(s - 4 * p, p3); \
|
||||
vst1##r##u8(s + 3 * p, q3); \
|
||||
vst1##r##u8(s + 4 * p, q4); \
|
||||
vst1##r##u8(s + 5 * p, q5); \
|
||||
vst1##r##u8(s + 6 * p, q6); \
|
||||
} \
|
||||
vst1##r##u8(s - 3 * p, p2); \
|
||||
vst1##r##u8(s + 2 * p, q2); \
|
||||
} \
|
||||
vst1##r##u8(s - 2 * p, p1); \
|
||||
vst1##r##u8(s - 1 * p, p0); \
|
||||
vst1##r##u8(s + 0 * p, q0); \
|
||||
vst1##r##u8(s + 1 * p, q1); \
|
||||
#define FUN_HOR_4_KERNEL(name, w) \
|
||||
static INLINE void lpf_horizontal_4##name##kernel( \
|
||||
uint8_t *s, const int p, const uint8x##w##_t blimit, \
|
||||
const uint8x##w##_t limit, const uint8x##w##_t thresh) { \
|
||||
uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \
|
||||
\
|
||||
load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \
|
||||
filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
|
||||
q3, &hev, &mask); \
|
||||
filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \
|
||||
store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \
|
||||
}
|
||||
|
||||
FUN_STORE14(8, _) // store_8x14
|
||||
FUN_STORE14(16, q_) // store_16x14
|
||||
#undef FUN_STORE14
|
||||
FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel
|
||||
FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel
|
||||
#undef FUN_HOR_4_KERNEL
|
||||
|
||||
void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
|
||||
const uint8_t *limit, const uint8_t *thresh) {
|
||||
uint8x8_t blimit_vec, limit_vec, thresh_vec;
|
||||
load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
|
||||
lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
|
||||
}
|
||||
|
||||
void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
|
||||
const uint8_t *limit0,
|
||||
const uint8_t *thresh0,
|
||||
const uint8_t *blimit1,
|
||||
const uint8_t *limit1,
|
||||
const uint8_t *thresh1) {
|
||||
uint8x16_t blimit_vec, limit_vec, thresh_vec;
|
||||
load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
|
||||
&blimit_vec, &limit_vec, &thresh_vec);
|
||||
lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
|
||||
}
|
||||
|
||||
void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
|
||||
const uint8_t *limit, const uint8_t *thresh) {
|
||||
uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
|
||||
mask, hev;
|
||||
load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
|
||||
load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
|
||||
q2, q3, &hev, &mask);
|
||||
filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
|
||||
store_4x8(s - 2, p, p1, p0, q0, q1);
|
||||
}
|
||||
|
||||
void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
|
||||
const uint8_t *limit0, const uint8_t *thresh0,
|
||||
const uint8_t *blimit1, const uint8_t *limit1,
|
||||
const uint8_t *thresh1) {
|
||||
uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
|
||||
mask, hev;
|
||||
uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
|
||||
s15;
|
||||
|
||||
load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
|
||||
&blimit_vec, &limit_vec, &thresh_vec);
|
||||
load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
|
||||
&s11, &s12, &s13, &s14, &s15);
|
||||
transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
|
||||
s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
|
||||
q2, q3, &hev, &mask);
|
||||
filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
|
||||
s -= 2;
|
||||
store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
|
||||
vget_low_u8(q1));
|
||||
store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
|
||||
vget_high_u8(q1));
|
||||
}
|
||||
|
||||
void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
|
||||
const uint8_t *limit, const uint8_t *thresh) {
|
||||
uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
|
||||
op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
|
||||
uint32_t flat_status;
|
||||
|
||||
load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
|
||||
load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
|
||||
p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
|
||||
filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
|
||||
&op1, &op0, &oq0, &oq1, &oq2);
|
||||
store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
|
||||
}
|
||||
|
||||
void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
|
||||
const uint8_t *limit0,
|
||||
const uint8_t *thresh0,
|
||||
const uint8_t *blimit1,
|
||||
const uint8_t *limit1,
|
||||
const uint8_t *thresh1) {
|
||||
uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
|
||||
op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
|
||||
uint32_t flat_status;
|
||||
|
||||
load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
|
||||
&blimit_vec, &limit_vec, &thresh_vec);
|
||||
load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
|
||||
p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
|
||||
filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
|
||||
&op1, &op0, &oq0, &oq1, &oq2);
|
||||
store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
|
||||
}
|
||||
|
||||
void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
|
||||
const uint8_t *limit, const uint8_t *thresh) {
|
||||
uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
|
||||
op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
|
||||
uint32_t flat_status;
|
||||
|
||||
load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
|
||||
load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
|
||||
p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
|
||||
filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
|
||||
&op1, &op0, &oq0, &oq1, &oq2);
|
||||
// Note: transpose + store_8x8() is faster than store_6x8().
|
||||
transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
|
||||
store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
|
||||
}
|
||||
|
||||
void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
|
||||
const uint8_t *limit0, const uint8_t *thresh0,
|
||||
const uint8_t *blimit1, const uint8_t *limit1,
|
||||
const uint8_t *thresh1) {
|
||||
uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
|
||||
op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
|
||||
uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
|
||||
s15;
|
||||
uint32_t flat_status;
|
||||
|
||||
load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
|
||||
&blimit_vec, &limit_vec, &thresh_vec);
|
||||
load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
|
||||
&s11, &s12, &s13, &s14, &s15);
|
||||
transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
|
||||
s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
|
||||
p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
|
||||
filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
|
||||
&op1, &op0, &oq0, &oq1, &oq2);
|
||||
// Note: store_6x8() twice is faster than transpose + store_8x16().
|
||||
store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
|
||||
vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
|
||||
store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
|
||||
vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
|
||||
vget_high_u8(oq2));
|
||||
}
|
||||
|
||||
#define FUN_LPF_16_KERNEL(name, w) \
|
||||
static INLINE void lpf_16##name##kernel( \
|
||||
@ -784,7 +1038,9 @@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
|
||||
&s6, &s7);
|
||||
store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
|
||||
} else {
|
||||
store_6x8(s + 8, p, op2, op1, op0, oq0, oq1, oq2);
|
||||
// Note: transpose + store_8x8() is faster than store_6x8().
|
||||
transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
|
||||
store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
|
||||
}
|
||||
} else {
|
||||
store_4x8(s + 6, p, op1, op0, oq0, oq1);
|
||||
@ -819,6 +1075,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
|
||||
store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
|
||||
s13, s14, s15);
|
||||
} else {
|
||||
// Note: store_6x8() twice is faster than transpose + store_8x16().
|
||||
s += 8;
|
||||
store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
|
||||
vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
|
||||
|
@ -39,28 +39,84 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
|
||||
return b0;
|
||||
}
|
||||
|
||||
// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
|
||||
// 'q' registers here to save some instructions.
|
||||
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
|
||||
uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
|
||||
uint8x8_t *a6, uint8x8_t *a7) {
|
||||
// Swap 8 bit elements. Goes from:
|
||||
// a0: 00 01 02 03 04 05 06 07
|
||||
// a1: 10 11 12 13 14 15 16 17
|
||||
// a2: 20 21 22 23 24 25 26 27
|
||||
// a3: 30 31 32 33 34 35 36 37
|
||||
// a4: 40 41 42 43 44 45 46 47
|
||||
// a5: 50 51 52 53 54 55 56 57
|
||||
// a6: 60 61 62 63 64 65 66 67
|
||||
// a7: 70 71 72 73 74 75 76 77
|
||||
// to:
|
||||
// b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
|
||||
// b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
|
||||
// b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
|
||||
// b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
|
||||
|
||||
const uint8x16x2_t b0 =
|
||||
vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
|
||||
const uint8x16x2_t b1 =
|
||||
vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
|
||||
|
||||
// Swap 16 bit elements resulting in:
|
||||
// c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
|
||||
// c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
|
||||
// c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
|
||||
// c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
|
||||
|
||||
const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
|
||||
vreinterpretq_u16_u8(b1.val[0]));
|
||||
const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
|
||||
vreinterpretq_u16_u8(b1.val[1]));
|
||||
|
||||
// Unzip 32 bit elements resulting in:
|
||||
// d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
|
||||
// d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
|
||||
// d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
|
||||
// d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
|
||||
const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
|
||||
vreinterpretq_u32_u16(c1.val[0]));
|
||||
const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
|
||||
vreinterpretq_u32_u16(c1.val[1]));
|
||||
|
||||
*a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
|
||||
*a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
|
||||
*a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
|
||||
*a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
|
||||
*a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
|
||||
*a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
|
||||
*a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
|
||||
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
|
||||
}
|
||||
|
||||
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
|
||||
int16x8_t *a2, int16x8_t *a3,
|
||||
int16x8_t *a4, int16x8_t *a5,
|
||||
int16x8_t *a6, int16x8_t *a7) {
|
||||
// Swap 16 bit elements. Goes from:
|
||||
// a0: 00 01 02 03 04 05 06 07
|
||||
// a1: 08 09 10 11 12 13 14 15
|
||||
// a2: 16 17 18 19 20 21 22 23
|
||||
// a3: 24 25 26 27 28 29 30 31
|
||||
// a4: 32 33 34 35 36 37 38 39
|
||||
// a5: 40 41 42 43 44 45 46 47
|
||||
// a6: 48 49 50 51 52 53 54 55
|
||||
// a7: 56 57 58 59 60 61 62 63
|
||||
// a1: 10 11 12 13 14 15 16 17
|
||||
// a2: 20 21 22 23 24 25 26 27
|
||||
// a3: 30 31 32 33 34 35 36 37
|
||||
// a4: 40 41 42 43 44 45 46 47
|
||||
// a5: 50 51 52 53 54 55 56 57
|
||||
// a6: 60 61 62 63 64 65 66 67
|
||||
// a7: 70 71 72 73 74 75 76 77
|
||||
// to:
|
||||
// b0.val[0]: 00 08 02 10 04 12 06 14
|
||||
// b0.val[1]: 01 09 03 11 05 13 07 15
|
||||
// b1.val[0]: 16 24 18 26 20 28 22 30
|
||||
// b1.val[1]: 17 25 19 27 21 29 23 31
|
||||
// b2.val[0]: 32 40 34 42 36 44 38 46
|
||||
// b2.val[1]: 33 41 35 43 37 45 39 47
|
||||
// b3.val[0]: 48 56 50 58 52 60 54 62
|
||||
// b3.val[1]: 49 57 51 59 53 61 55 63
|
||||
// b0.val[0]: 00 10 02 12 04 14 06 16
|
||||
// b0.val[1]: 01 11 03 13 05 15 07 17
|
||||
// b1.val[0]: 20 30 22 32 24 34 26 36
|
||||
// b1.val[1]: 21 31 23 33 25 35 27 37
|
||||
// b2.val[0]: 40 50 42 52 44 54 46 56
|
||||
// b2.val[1]: 41 51 43 53 45 55 47 57
|
||||
// b3.val[0]: 60 70 62 72 64 74 66 76
|
||||
// b3.val[1]: 61 71 63 73 65 75 67 77
|
||||
|
||||
const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
|
||||
const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
|
||||
@ -68,14 +124,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
|
||||
const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
|
||||
|
||||
// Swap 32 bit elements resulting in:
|
||||
// c0.val[0]: 00 08 16 24 04 12 20 28
|
||||
// c0.val[1]: 02 10 18 26 06 14 22 30
|
||||
// c1.val[0]: 01 09 17 25 05 13 21 29
|
||||
// c1.val[1]: 03 11 19 27 07 15 23 31
|
||||
// c2.val[0]: 32 40 48 56 36 44 52 60
|
||||
// c2.val[1]: 34 42 50 58 38 46 54 62
|
||||
// c3.val[0]: 33 41 49 57 37 45 53 61
|
||||
// c3.val[1]: 35 43 51 59 39 47 55 63
|
||||
// c0.val[0]: 00 10 20 30 04 14 24 34
|
||||
// c0.val[1]: 02 12 22 32 06 16 26 36
|
||||
// c1.val[0]: 01 11 21 31 05 15 25 35
|
||||
// c1.val[1]: 03 13 23 33 07 17 27 37
|
||||
// c2.val[0]: 40 50 60 70 44 54 64 74
|
||||
// c2.val[1]: 42 52 62 72 46 56 66 76
|
||||
// c3.val[0]: 41 51 61 71 45 55 65 75
|
||||
// c3.val[1]: 43 53 63 73 47 57 67 77
|
||||
|
||||
const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
|
||||
vreinterpretq_s32_s16(b1.val[0]));
|
||||
@ -87,14 +143,14 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
|
||||
vreinterpretq_s32_s16(b3.val[1]));
|
||||
|
||||
// Swap 64 bit elements resulting in:
|
||||
// d0.val[0]: 00 08 16 24 32 40 48 56
|
||||
// d0.val[1]: 04 12 20 28 36 44 52 60
|
||||
// d1.val[0]: 01 09 17 25 33 41 49 57
|
||||
// d1.val[1]: 05 13 21 29 37 45 53 61
|
||||
// d2.val[0]: 02 10 18 26 34 42 50 58
|
||||
// d2.val[1]: 06 14 22 30 38 46 54 62
|
||||
// d3.val[0]: 03 11 19 27 35 43 51 59
|
||||
// d3.val[1]: 07 15 23 31 39 47 55 63
|
||||
// d0.val[0]: 00 10 20 30 40 50 60 70
|
||||
// d0.val[1]: 04 14 24 34 44 54 64 74
|
||||
// d1.val[0]: 01 11 21 31 41 51 61 71
|
||||
// d1.val[1]: 05 15 25 35 45 55 65 75
|
||||
// d2.val[0]: 02 12 22 32 42 52 62 72
|
||||
// d2.val[1]: 06 16 26 36 46 56 66 76
|
||||
// d3.val[0]: 03 13 23 33 43 53 63 73
|
||||
// d3.val[1]: 07 17 27 37 47 57 67 77
|
||||
const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]);
|
||||
const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]);
|
||||
const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]);
|
||||
|
@ -136,8 +136,8 @@ DSP_SRCS-yes += loopfilter.c
|
||||
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
|
||||
DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
|
||||
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
|
||||
ifeq ($(HAVE_NEON_ASM),yes)
|
||||
DSP_SRCS-yes += arm/loopfilter_neon.c
|
||||
DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM)
|
||||
DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM)
|
||||
DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
|
||||
@ -145,9 +145,6 @@ DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
|
||||
else
|
||||
ifeq ($(HAVE_NEON),yes)
|
||||
DSP_SRCS-yes += arm/loopfilter_mb_neon.c
|
||||
DSP_SRCS-yes += arm/loopfilter_16_neon.c
|
||||
DSP_SRCS-yes += arm/loopfilter_8_neon.c
|
||||
DSP_SRCS-yes += arm/loopfilter_4_neon.c
|
||||
endif # HAVE_NEON
|
||||
endif # HAVE_NEON_ASM
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user