diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 4ec11ec4..64d2b647 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -620,16 +620,16 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
 #endif   // USE_INTRINSICS
 
 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4 * stride;
     SimpleVFilter16(p, stride, thresh);
   }
 }
 
 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4;
     SimpleHFilter16(p, stride, thresh);
   }
@@ -845,18 +845,23 @@ static void HFilter16(uint8_t* p, int stride,
 // on three inner edges
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4 * stride;
-    Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                            ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store16x4(op1, op0, oq0, oq1, p, stride);
+      // p3 and p2 are not just temporary variables here: they will be
+      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store16x4(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
@@ -864,18 +869,21 @@ static void VFilter16i(uint8_t* p, int stride,
 #if !defined(WORK_AROUND_GCC)
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4;
-    Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                            ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store4x16(op1, op0, oq0, oq1, p, stride);
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store4x16(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
diff --git a/src/dsp/dec_sse2.c b/src/dsp/dec_sse2.c
index 8e8cff4e..c37a637f 100644
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@@ -608,43 +608,45 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
 }
 
 // Transpose back and store
-static WEBP_INLINE void Store16x4(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
+                                  const __m128i* const p0,
+                                  const __m128i* const q0,
+                                  const __m128i* const q1,
                                   uint8_t* r0, uint8_t* r8, int stride) {
-  __m128i t1;
+  __m128i t1, p1_s, p0_s, q0_s, q1_s;
 
   // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   t1 = *p0;
-  *p0 = _mm_unpacklo_epi8(*p1, t1);
-  *p1 = _mm_unpackhi_epi8(*p1, t1);
+  p0_s = _mm_unpacklo_epi8(*p1, t1);
+  p1_s = _mm_unpackhi_epi8(*p1, t1);
 
   // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
   // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
   t1 = *q0;
-  *q0 = _mm_unpacklo_epi8(t1, *q1);
-  *q1 = _mm_unpackhi_epi8(t1, *q1);
+  q0_s = _mm_unpacklo_epi8(t1, *q1);
+  q1_s = _mm_unpackhi_epi8(t1, *q1);
 
   // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-  t1 = *p0;
-  *p0 = _mm_unpacklo_epi16(t1, *q0);
-  *q0 = _mm_unpackhi_epi16(t1, *q0);
+  t1 = p0_s;
+  p0_s = _mm_unpacklo_epi16(t1, q0_s);
+  q0_s = _mm_unpackhi_epi16(t1, q0_s);
 
   // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-  t1 = *p1;
-  *p1 = _mm_unpacklo_epi16(t1, *q1);
-  *q1 = _mm_unpackhi_epi16(t1, *q1);
+  t1 = p1_s;
+  p1_s = _mm_unpacklo_epi16(t1, q1_s);
+  q1_s = _mm_unpackhi_epi16(t1, q1_s);
 
-  Store4x4(p0, r0, stride);
+  Store4x4(&p0_s, r0, stride);
   r0 += 4 * stride;
-  Store4x4(q0, r0, stride);
+  Store4x4(&q0_s, r0, stride);
 
-  Store4x4(p1, r8, stride);
+  Store4x4(&p1_s, r8, stride);
   r8 += 4 * stride;
-  Store4x4(q1, r8, stride);
+  Store4x4(&q1_s, r8, stride);
 }
 
 //------------------------------------------------------------------------------
@@ -693,17 +695,17 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-#define MAX_DIFF1(p3, p2, p1, p0, m) {                                         \
-  m = MM_ABS(p3, p2);                                                          \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-}
-
-#define MAX_DIFF2(p3, p2, p1, p0, m) {                                         \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+#define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
+  m = MM_ABS(p1, p0);                                                          \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+} while (0)
+
+#define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
   m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-}
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+} while (0)
 
 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
   e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
@@ -712,10 +714,11 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
   e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
 }
 
-#define LOADUV_H_EDGE(p, u, v, stride) {                                       \
-  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                               \
-  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)]));        \
-}
+#define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
+  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
+  const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
+  p = _mm_unpacklo_epi64(U, V);                                                \
+} while (0)
 
 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
   LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
@@ -794,54 +797,61 @@ static void HFilter16(uint8_t* p, int stride,
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
   int k;
-  __m128i mask;
-  __m128i t1, t2, p1, p0, q0, q1;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  LOAD_H_EDGES4(p, stride, p3, p2, p1, p0);  // prologue
 
   for (k = 3; k > 0; --k) {
-    // Load p3, p2, p1, p0
-    LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
-    MAX_DIFF1(t2, t1, p1, p0, mask);
-
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2 * stride;   // beginning of p1
     p += 4 * stride;
 
-    // Load q0, q1, q2, q3
-    LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
-    MAX_DIFF2(t2, t1, q1, q0, mask);
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    // p3 and p2 are not just temporary variables here: they will be
+    // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
     // Store
-    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
-    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
-    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
-    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
+    _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
+    _mm_storeu_si128((__m128i*)&b[3 * stride], p2);
+
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
   }
 }
 
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
   int k;
-  uint8_t* b;
-  __m128i mask;
-  __m128i t1, t2, p1, p0, q0, q1;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
 
   for (k = 3; k > 0; --k) {
-    b = p;
-    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
-    MAX_DIFF1(t2, t1, p1, p0, mask);
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2;   // beginning of p1
 
-    b += 4;  // beginning of q0
-    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
-    MAX_DIFF2(t2, t1, q1, q0, mask);
+    p += 4;  // beginning of q0 (and next span)
 
-    ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    b -= 2;  // beginning of p1
-    Store16x4(&p1, &p0, &q0, &q1, b, b + 8 * stride, stride);
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
-    p += 4;
+    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
  }
 }
 
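
Note on the main change: both the NEON and the SSE2 variants of VFilter16i/HFilter16i used to reload all eight rows (or columns) p3..q3 for each of the three inner edges, even though consecutive edges overlap by four rows. The patch loads p3..p0 once before the loop, loads only q0..q3 per iteration, and then rotates the registers: this edge's filtered q0/q1 land in the p3/p2 slots and the untouched q2/q3 become the next edge's p1/p0 (hence the "not just temporary variables" comment in the diff). The following plain-C sketch models that rotation under stated assumptions; FilterInnerEdgesModel/FilterEdgeModel are hypothetical names, it is not libwebp code, and the filter arithmetic is only a placeholder for NeedsFilter2/ComplexMask/DoFilter4.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

enum { W = 16 };            // width of one macroblock row
typedef uint8_t Row[W];     // one row of pixels, standing in for one SIMD register

// Placeholder edge filter: p3..q3 feed a crude smoothness test, then p0/q0 are
// nudged toward each other. The real DoFilter4 may also update p1/q1.
static void FilterEdgeModel(const Row p3, const Row p2, const Row p1, Row p0,
                            Row q0, const Row q1, const Row q2, const Row q3,
                            int thresh) {
  int i;
  for (i = 0; i < W; ++i) {
    const int flat =
        abs(p3[i] - p2[i]) <= thresh && abs(p2[i] - p1[i]) <= thresh &&
        abs(p1[i] - p0[i]) <= thresh && abs(q1[i] - q0[i]) <= thresh &&
        abs(q2[i] - q1[i]) <= thresh && abs(q3[i] - q2[i]) <= thresh;
    if (flat) {
      const int delta = (q0[i] - p0[i]) >> 2;   // illustrative arithmetic only
      p0[i] = (uint8_t)(p0[i] + delta);
      q0[i] = (uint8_t)(q0[i] - delta);
    }
  }
}

// Filters the three inner horizontal edges (rows 4, 8, 12) of a 16-row block.
// Only four rows are loaded per edge; the previous edge's q0..q3 are rotated
// into the next edge's p3..p0, which is the scheme VFilter16i uses above.
static void FilterInnerEdgesModel(uint8_t* p, int stride, int thresh) {
  Row p3, p2, p1, p0, q0, q1, q2, q3;
  int k;
  memcpy(p3, p + 0 * stride, sizeof(Row));      // prologue: rows 0..3
  memcpy(p2, p + 1 * stride, sizeof(Row));
  memcpy(p1, p + 2 * stride, sizeof(Row));
  memcpy(p0, p + 3 * stride, sizeof(Row));
  for (k = 3; k != 0; --k) {
    p += 4 * stride;                            // p now points at this edge's q0
    memcpy(q0, p + 0 * stride, sizeof(Row));    // only the four new rows are read
    memcpy(q1, p + 1 * stride, sizeof(Row));
    memcpy(q2, p + 2 * stride, sizeof(Row));
    memcpy(q3, p + 3 * stride, sizeof(Row));
    FilterEdgeModel(p3, p2, p1, p0, q0, q1, q2, q3, thresh);
    memcpy(p - 2 * stride, p1, sizeof(Row));    // write back the four rows around
    memcpy(p - 1 * stride, p0, sizeof(Row));    // the edge, mirroring Store16x4
    memcpy(p + 0 * stride, q0, sizeof(Row));
    memcpy(p + 1 * stride, q1, sizeof(Row));
    // Rotate: this edge's (now filtered) q0/q1 become the next edge's p3/p2,
    // and the untouched q2/q3 become the next edge's p1/p0.
    memcpy(p3, q0, sizeof(Row));
    memcpy(p2, q1, sizeof(Row));
    memcpy(p1, q2, sizeof(Row));
    memcpy(p0, q3, sizeof(Row));
  }
}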
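
A smaller cleanup in the SSE2 hunks: the statement-like macros (MAX_DIFF1, MAX_DIFF2, LOADUV_H_EDGE) are rewritten as do { ... } while (0) blocks, which also gives LOADUV_H_EDGE a scope for its new U/V temporaries. A generic illustration of why that form is the safer idiom (not libwebp code):

#define RESET_BRACES(x)  { (x) = 0; }
#define RESET_DOWHILE(x) do { (x) = 0; } while (0)

// With the brace-only form, the ';' the caller naturally writes becomes an
// extra empty statement, so the following no longer compiles:
//   if (cond) RESET_BRACES(v);    // expands to "{ v = 0; };"
//   else v = 1;                   // error: 'else' without a matching 'if'
// The do/while(0) form consumes exactly one ';' and behaves as a single
// statement in every context:
//   if (cond) RESET_DOWHILE(v);
//   else v = 1;                   // fine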