add light filtering NEON intrinsics
disabled for now (but tested OK), thanks to the USE_INTRINSICS #define We'll activate the code when we're on par with non-intrinsics Change-Id: Idbfb9cb01f4c7c9f5131b270f8c11b70d0d485ff
This commit is contained in:
parent
af44460880
commit
daccbf400d
@ -16,6 +16,8 @@
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
// #define USE_INTRINSICS // use intrinsics when possible
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "../dec/vp8i.h"
|
||||
@ -68,6 +70,65 @@
|
||||
DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
|
||||
FLIP_SIGN_BIT2(p0, q0, q10)
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
|
||||
const uint8x16_t q0, const uint8x16_t q1,
|
||||
int thresh) {
|
||||
const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
|
||||
const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
|
||||
const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
|
||||
const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
|
||||
const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
|
||||
const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
|
||||
const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
|
||||
return mask;
|
||||
}
|
||||
|
||||
static int8x16_t FlipSign(const uint8x16_t v) {
|
||||
const uint8x16_t sign_bit = vdupq_n_u8(0x80);
|
||||
return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
|
||||
}
|
||||
|
||||
static uint8x16_t FlipSignBack(const int8x16_t v) {
|
||||
const int8x16_t sign_bit = vdupq_n_s8(0x80);
|
||||
return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
|
||||
}
|
||||
|
||||
static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
|
||||
const int8x16_t q0, const int8x16_t q1) {
|
||||
const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
|
||||
const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
|
||||
const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
|
||||
const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
|
||||
const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
|
||||
return s3;
|
||||
}
|
||||
|
||||
static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
|
||||
const uint8x16_t q0, const uint8x16_t q1,
|
||||
uint8x16_t* const op0, uint8x16_t* const oq0,
|
||||
int thresh) {
|
||||
const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
|
||||
const int8x16_t p1s = FlipSign(p1);
|
||||
const int8x16_t p0s = FlipSign(p0);
|
||||
const int8x16_t q0s = FlipSign(q0);
|
||||
const int8x16_t q1s = FlipSign(q1);
|
||||
const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
|
||||
const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
|
||||
// DoSimpleFilter:
|
||||
const int8x16_t kCst3 = vdupq_n_s8(0x03);
|
||||
const int8x16_t kCst4 = vdupq_n_s8(0x04);
|
||||
const int8x16_t delta_p3 = vqaddq_s8(delta1, kCst3);
|
||||
const int8x16_t delta_p4 = vqaddq_s8(delta1, kCst4);
|
||||
const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
|
||||
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
|
||||
const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
|
||||
const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
|
||||
*op0 = FlipSignBack(sp0);
|
||||
*oq0 = FlipSignBack(sq0);
|
||||
}
|
||||
#endif // USE_INTRINSICS
|
||||
|
||||
// Load/Store vertical edge
|
||||
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
|
||||
"vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
|
||||
@ -113,6 +174,7 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
|
||||
// Simple In-loop filtering (Paragraph 15.2)
|
||||
|
||||
static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
|
||||
#if !defined(USE_INTRINSICS)
|
||||
__asm__ volatile (
|
||||
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
|
||||
|
||||
@ -131,6 +193,16 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
|
||||
: [stride] "r"(stride), [thresh] "r"(thresh)
|
||||
: "memory", QRegs
|
||||
);
|
||||
#else
|
||||
const uint8x16_t p1 = vld1q_u8(p - 2 * stride);
|
||||
const uint8x16_t p0 = vld1q_u8(p - 1 * stride);
|
||||
const uint8x16_t q0 = vld1q_u8(p + 0 * stride);
|
||||
const uint8x16_t q1 = vld1q_u8(p + 1 * stride);
|
||||
uint8x16_t oq0, op0;
|
||||
DoFilter2(p1, p0, q0, q1, &op0, &oq0, thresh);
|
||||
vst1q_u8(p - stride, op0);
|
||||
vst1q_u8(p, oq0);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
|
||||
|
Loading…
Reference in New Issue
Block a user