Merge "strong filtering speed-up (~2-3% x86, ~1-2% for NEON)"
commit 9754d39a4e
src/dsp/dec_neon.c
@@ -620,16 +620,16 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
 #endif   // USE_INTRINSICS
 
 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4 * stride;
     SimpleVFilter16(p, stride, thresh);
   }
 }
 
 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4;
     SimpleHFilter16(p, stride, thresh);
   }
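Note: the switch from "int k" with "k > 0" to "uint32_t k" with "k != 0" is presumably aimed at the loop overhead itself: with an unsigned count-down and a plain zero test, the compiler can use the decrement's own flag update as the loop condition (a single SUBS/BNE pair on ARM) instead of a separate signed compare. The iteration count is unchanged.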
@@ -845,18 +845,23 @@ static void HFilter16(uint8_t* p, int stride,
 // on three inner edges
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4 * stride;
-    Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store16x4(op1, op0, oq0, oq1, p, stride);
+      // p3 and p2 are not just temporary variables here: they will be
+      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store16x4(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
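Note: the "p3 and p2 are not just temporary variables" comment is the heart of this hunk. Consecutive 4-row spans share half their rows, so the new code loads four rows once in a prologue and only the four new rows per iteration (4 + 3*4 = 16 row loads instead of 3*8 = 24 with Load16x8), carrying the rest in registers. Below is a scalar sketch of the rotation with hypothetical names, one byte standing in for a 16-pixel uint8x16_t row; it illustrates the pattern only, it is not the NEON code:

#include <stdint.h>

/* Stand-in for DoFilter4, just to keep the sketch self-contained. */
static void Filter4Sketch(uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                          uint8_t* op1, uint8_t* op0,
                          uint8_t* oq0, uint8_t* oq1) {
  *op1 = p1; *op0 = p0; *oq0 = q0; *oq1 = q1;
}

static void VFilterSpansSketch(uint8_t* p, int stride) {
  uint32_t k;
  uint8_t p3 = p[2 * stride], p2 = p[3 * stride];   /* prologue: 4 loads */
  uint8_t p1 = p[4 * stride], p0 = p[5 * stride];
  for (k = 3; k != 0; --k) {
    uint8_t q0, q1, q2, q3, fq0, fq1;
    p += 4 * stride;
    q0 = p[2 * stride];   /* only the 4 new rows of this span are loaded */
    q1 = p[3 * stride];
    q2 = p[4 * stride];
    q3 = p[5 * stride];
    /* in the real code, p3/p2 also feed the NeedsFilter2 mask here */
    Filter4Sketch(p1, p0, q0, q1, &p1, &p0, &fq0, &fq1);
    p[0 * stride] = p1;   /* write back the four filtered rows */
    p[1 * stride] = p0;
    p[2 * stride] = fq0;
    p[3 * stride] = fq1;
    /* rotate: next span's p3..p0 are this span's filtered q0/q1
       and untouched q2/q3 */
    p3 = fq0; p2 = fq1; p1 = q2; p0 = q3;
  }
}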
@@ -864,18 +869,21 @@ static void VFilter16i(uint8_t* p, int stride,
 #if !defined(WORK_AROUND_GCC)
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4;
-    Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store4x16(op1, op0, oq0, oq1, p, stride);
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store4x16(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
src/dsp/dec_sse2.c
@@ -608,43 +608,45 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
 }
 
 // Transpose back and store
-static WEBP_INLINE void Store16x4(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
+                                  const __m128i* const p0,
+                                  const __m128i* const q0,
+                                  const __m128i* const q1,
                                   uint8_t* r0, uint8_t* r8,
                                   int stride) {
-  __m128i t1;
+  __m128i t1, p1_s, p0_s, q0_s, q1_s;
 
   // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   t1 = *p0;
-  *p0 = _mm_unpacklo_epi8(*p1, t1);
-  *p1 = _mm_unpackhi_epi8(*p1, t1);
+  p0_s = _mm_unpacklo_epi8(*p1, t1);
+  p1_s = _mm_unpackhi_epi8(*p1, t1);
 
   // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
   // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
   t1 = *q0;
-  *q0 = _mm_unpacklo_epi8(t1, *q1);
-  *q1 = _mm_unpackhi_epi8(t1, *q1);
+  q0_s = _mm_unpacklo_epi8(t1, *q1);
+  q1_s = _mm_unpackhi_epi8(t1, *q1);
 
   // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-  t1 = *p0;
-  *p0 = _mm_unpacklo_epi16(t1, *q0);
-  *q0 = _mm_unpackhi_epi16(t1, *q0);
+  t1 = p0_s;
+  p0_s = _mm_unpacklo_epi16(t1, q0_s);
+  q0_s = _mm_unpackhi_epi16(t1, q0_s);
 
   // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-  t1 = *p1;
-  *p1 = _mm_unpacklo_epi16(t1, *q1);
-  *q1 = _mm_unpackhi_epi16(t1, *q1);
+  t1 = p1_s;
+  p1_s = _mm_unpacklo_epi16(t1, q1_s);
+  q1_s = _mm_unpackhi_epi16(t1, q1_s);
 
-  Store4x4(p0, r0, stride);
+  Store4x4(&p0_s, r0, stride);
   r0 += 4 * stride;
-  Store4x4(q0, r0, stride);
+  Store4x4(&q0_s, r0, stride);
 
-  Store4x4(p1, r8, stride);
+  Store4x4(&p1_s, r8, stride);
   r8 += 4 * stride;
-  Store4x4(q1, r8, stride);
+  Store4x4(&q1_s, r8, stride);
 }
 
 //------------------------------------------------------------------------------
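Note: the Store16x4 change is a prerequisite for the rewritten filters below. HFilter16i now keeps p3/p2 (the just-filtered q0/q1) live across the Store16x4 call and feeds them into the next span's MAX_DIFF1, so the transpose may no longer scramble its inputs in place; hence the const pointers and the p1_s/p0_s/q0_s/q1_s shadow copies. A minimal sketch of the same pattern, with hypothetical names:

#include <stdint.h>

typedef struct { uint8_t lane[16]; } Vec;   /* stands in for __m128i */

/* Inputs are const: the shuffling happens in shadow copies, so the
   caller's registers survive the call. */
static void StoreInterleaved(const Vec* const a, const Vec* const b,
                             uint8_t* dst) {
  const Vec a_s = *a;   /* shadow copies, like p1_s/p0_s/q0_s/q1_s */
  const Vec b_s = *b;
  int i;
  for (i = 0; i < 16; ++i) {   /* interleave, as _mm_unpacklo_epi8 does */
    dst[2 * i + 0] = a_s.lane[i];
    dst[2 * i + 1] = b_s.lane[i];
  }
}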
@@ -693,17 +695,17 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-#define MAX_DIFF1(p3, p2, p1, p0, m) {                                         \
-  m = MM_ABS(p3, p2);                                                          \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-}
+#define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
+  m = MM_ABS(p1, p0);                                                          \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+} while (0)
 
-#define MAX_DIFF2(p3, p2, p1, p0, m) {                                         \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-}
+#define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+} while (0)
 
 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
   e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
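Note: wrapping the macro bodies in do { ... } while (0) turns each invocation into a single statement that consumes the trailing semicolon, so the macros compose safely with if/else. A classic illustration with hypothetical macros, not from this patch:

#define BAD_PAIR(a, b)  { (a)++; (b)++; }
#define GOOD_PAIR(a, b) do { (a)++; (b)++; } while (0)

void PairDemo(int cond, int* x, int* y) {
  if (cond)
    GOOD_PAIR(*x, *y);   /* fine: expands to one statement */
  else
    (*x)--;
  /* With BAD_PAIR the body would expand to "{ ... } ;" -- a block plus an
     empty statement -- and the "else" would no longer parse. */
}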
@@ -712,10 +714,11 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
   e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
 }
 
-#define LOADUV_H_EDGE(p, u, v, stride) {                                       \
-  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                               \
-  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)]));        \
-}
+#define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
+  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
+  const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
+  p = _mm_unpacklo_epi64(U, V);                                                \
+} while (0)
 
 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
   LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
@@ -794,54 +797,61 @@ static void HFilter16(uint8_t* p, int stride,
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
   int k;
-  __m128i mask;
-  __m128i t1, t2, p1, p0, q0, q1;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  LOAD_H_EDGES4(p, stride, p3, p2, p1, p0);  // prologue
 
   for (k = 3; k > 0; --k) {
-    // Load p3, p2, p1, p0
-    LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
-    MAX_DIFF1(t2, t1, p1, p0, mask);
-
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2 * stride;   // beginning of p1
     p += 4 * stride;
 
-    // Load q0, q1, q2, q3
-    LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
-    MAX_DIFF2(t2, t1, q1, q0, mask);
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    // p3 and p2 are not just temporary variables here: they will be
+    // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
     // Store
-    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
-    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
-    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
-    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
+    _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
+    _mm_storeu_si128((__m128i*)&b[3 * stride], p2);
+
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
   }
 }
 
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
   int k;
-  uint8_t* b;
-  __m128i mask;
-  __m128i t1, t2, p1, p0, q0, q1;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
 
   for (k = 3; k > 0; --k) {
-    b = p;
-    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
-    MAX_DIFF1(t2, t1, p1, p0, mask);
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2;   // beginning of p1
+    p += 4;  // beginning of q0 (and next span)
 
-    b += 4;  // beginning of q0
-    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
-    MAX_DIFF2(t2, t1, q1, q0, mask);
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
-    b -= 2;  // beginning of p1
-    Store16x4(&p1, &p0, &q0, &q1, b, b + 8 * stride, stride);
+    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
 
-    p += 4;
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
   }
 }
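Note: the SSE2 rewrite applies the same span rotation as the NEON one and, in addition, assembles the filter-strength mask in two steps so it can work from rows already in registers: MAX_DIFF1 covers the p-side rows carried over from the previous span, MAX_DIFF2 folds in the freshly loaded q-side rows. A scalar mirror of the accumulation, with hypothetical helpers and one byte per row (the threshold comparison and the p0/q0 term come later, in ComplexMask):

#include <stdint.h>

static uint8_t AbsDiff(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? a - b : b - a);   /* scalar MM_ABS */
}
static uint8_t Max8(uint8_t a, uint8_t b) { return a > b ? a : b; }

static uint8_t BuildMaskSketch(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                               uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
  uint8_t m;
  m = AbsDiff(p1, p0);             /* MAX_DIFF1(p3, p2, p1, p0, m) ... */
  m = Max8(m, AbsDiff(p3, p2));    /* ... on the rows kept from the    */
  m = Max8(m, AbsDiff(p2, p1));    /* ... previous span                */
  m = Max8(m, AbsDiff(q2, q3));    /* MAX_DIFF2(q0, q1, q2, q3, m) ... */
  m = Max8(m, AbsDiff(q0, q1));    /* ... on the freshly loaded rows   */
  m = Max8(m, AbsDiff(q1, q2));
  return m;
}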