diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index 0604aa7d..a7866af8 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -16,6 +16,8 @@ #if defined(WEBP_USE_NEON) +// #define USE_INTRINSICS // use intrinsics when possible + #include #include "../dec/vp8i.h" @@ -68,6 +70,65 @@ DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ FLIP_SIGN_BIT2(p0, q0, q10) +#if defined(USE_INTRINSICS) +static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + int thresh) { + const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh); + const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0) + const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1) + const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0) + const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2 + const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2); + const uint8x16_t mask = vcgeq_u8(thresh_v, sum); + return mask; +} + +static int8x16_t FlipSign(const uint8x16_t v) { + const uint8x16_t sign_bit = vdupq_n_u8(0x80); + return vreinterpretq_s8_u8(veorq_u8(v, sign_bit)); +} + +static uint8x16_t FlipSignBack(const int8x16_t v) { + const int8x16_t sign_bit = vdupq_n_s8(0x80); + return vreinterpretq_u8_s8(veorq_s8(v, sign_bit)); +} + +static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, + const int8x16_t q0, const int8x16_t q1) { + const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) + const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1) + const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0) + const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0) + const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0) + return s3; +} + +static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + uint8x16_t* const op0, uint8x16_t* const oq0, + int thresh) { + const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); + const int8x16_t p1s = FlipSign(p1); + const int8x16_t p0s = FlipSign(p0); + const int8x16_t q0s = FlipSign(q0); + const int8x16_t q1s = FlipSign(q1); + const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); + const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); + // DoSimpleFilter: + const int8x16_t kCst3 = vdupq_n_s8(0x03); + const int8x16_t kCst4 = vdupq_n_s8(0x04); + const int8x16_t delta_p3 = vqaddq_s8(delta1, kCst3); + const int8x16_t delta_p4 = vqaddq_s8(delta1, kCst4); + const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); + const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); + const int8x16_t sp0 = vqaddq_s8(p0s, delta3); + const int8x16_t sq0 = vqsubq_s8(q0s, delta4); + *op0 = FlipSignBack(sp0); + *oq0 = FlipSignBack(sq0); +} +#endif // USE_INTRINSICS + // Load/Store vertical edge #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \ @@ -113,6 +174,7 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, // Simple In-loop filtering (Paragraph 15.2) static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { +#if !defined(USE_INTRINSICS) __asm__ volatile ( "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride @@ -131,6 +193,16 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { : [stride] "r"(stride), [thresh] "r"(thresh) : "memory", QRegs ); +#else + const uint8x16_t p1 = vld1q_u8(p - 2 * stride); + const uint8x16_t p0 = vld1q_u8(p - 1 * stride); + const uint8x16_t q0 = vld1q_u8(p + 0 * stride); + const uint8x16_t q1 = vld1q_u8(p + 1 * stride); + uint8x16_t oq0, op0; + DoFilter2(p1, p0, q0, q1, &op0, &oq0, thresh); + vst1q_u8(p - stride, op0); + vst1q_u8(p, oq0); +#endif } static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {