smart_yuv: switch to planar instead of packed r/g/b processing

avoiding triplets of data should make it easier to write SSE2 versions.

FilterRow() can now filter all input in one single pass
-> conversion is 15-20% faster (but still overall slow compared to -pre 0)

Change-Id: I14c3215e672fdecde7ec80394e814bdc7445019f
This commit is contained in:
Pascal Massimino 2016-11-15 14:51:34 +01:00
parent c0699515af
commit 1d6e7bf39f

View File

@ -266,55 +266,61 @@ static int ScaleDown(int a, int b, int c, int d) {
return LinearToGammaF(0.25f * (A + B + C + D));
}
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
while (len-- > 0) {
const float R = GammaToLinearF(src[0]);
const float G = GammaToLinearF(src[1]);
const float B = GammaToLinearF(src[2]);
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
int i;
for (i = 0; i < w; ++i) {
const float R = GammaToLinearF(src[0 * w + i]);
const float G = GammaToLinearF(src[1 * w + i]);
const float B = GammaToLinearF(src[2 * w + i]);
const float Y = RGBToGrayF(R, G, B);
*dst++ = (fixed_y_t)LinearToGammaF(Y);
src += 3;
dst[i] = (fixed_y_t)LinearToGammaF(Y);
}
}
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
fixed_t* dst, int len) {
while (len--> 0) {
const int r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
const int g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
const int b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
fixed_t* dst, int uv_w) {
int i;
for (i = 0; i < uv_w; ++i) {
const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
const int W = RGBToGray(r, g, b);
dst[0] = (fixed_t)(r - W);
dst[1] = (fixed_t)(g - W);
dst[2] = (fixed_t)(b - W);
dst += 3;
src1 += 6;
src2 += 6;
dst[0 * uv_w] = (fixed_t)(r - W);
dst[1 * uv_w] = (fixed_t)(g - W);
dst[2 * uv_w] = (fixed_t)(b - W);
dst += 1;
src1 += 2;
src2 += 2;
}
}
static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int len) {
static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
int i;
for (i = 0; i < len; ++i) {
y[i] = RGBToGray(rgb[0], rgb[1], rgb[2]);
rgb += 3;
for (i = 0; i < w; ++i) {
y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
}
}
//------------------------------------------------------------------------------
static WEBP_INLINE int Filter(const fixed_t* const A, const fixed_t* const B,
int rightwise) {
int v;
if (!rightwise) {
v = (A[0] * 9 + A[-3] * 3 + B[0] * 3 + B[-3]);
} else {
v = (A[0] * 9 + A[+3] * 3 + B[0] * 3 + B[+3]);
static void FilterRow(const fixed_t* A, const fixed_t* B, int len,
const fixed_y_t* best_y, fixed_y_t* out) {
int i;
for (i = 0; i < len; ++i, ++A, ++B) {
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
return (v + 8) >> 4;
}
static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
const int v0 = (A * 3 + B + 2) >> 2;
return clip_y(v0 + W0);
}
//------------------------------------------------------------------------------
@ -329,52 +335,49 @@ static void ImportOneRow(const uint8_t* const r_ptr,
int pic_width,
fixed_y_t* const dst) {
int i;
const int w = (pic_width + 1) & ~1;
for (i = 0; i < pic_width; ++i) {
const int off = i * step;
dst[3 * i + 0] = UpLift(r_ptr[off]);
dst[3 * i + 1] = UpLift(g_ptr[off]);
dst[3 * i + 2] = UpLift(b_ptr[off]);
dst[i + 0 * w] = UpLift(r_ptr[off]);
dst[i + 1 * w] = UpLift(g_ptr[off]);
dst[i + 2 * w] = UpLift(b_ptr[off]);
}
if (pic_width & 1) { // replicate rightmost pixel
memcpy(dst + 3 * pic_width, dst + 3 * (pic_width - 1), 3 * sizeof(*dst));
dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
}
}
static void InterpolateTwoRows(const fixed_y_t* const best_y,
const fixed_t* const prev_uv,
const fixed_t* const cur_uv,
const fixed_t* const next_uv,
const fixed_t* prev_uv,
const fixed_t* cur_uv,
const fixed_t* next_uv,
int w,
fixed_y_t* const out1,
fixed_y_t* const out2) {
int i, k;
{ // special boundary case for i==0
const int W0 = best_y[0];
const int W1 = best_y[w];
for (k = 0; k <= 2; ++k) {
out1[k] = clip_y(Filter2(cur_uv[k], prev_uv[k]) + W0);
out2[k] = clip_y(Filter2(cur_uv[k], next_uv[k]) + W1);
}
}
for (i = 1; i < w - 1; ++i) {
const int W0 = best_y[i + 0];
const int W1 = best_y[i + w];
const int off = 3 * (i >> 1);
for (k = 0; k <= 2; ++k) {
const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
out1[3 * i + k] = clip_y(tmp0 + W0);
out2[3 * i + k] = clip_y(tmp1 + W1);
}
}
{ // special boundary case for i == w - 1
const int W0 = best_y[i + 0];
const int W1 = best_y[i + w];
const int off = 3 * (i >> 1);
for (k = 0; k <= 2; ++k) {
out1[3 * i + k] = clip_y(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
out2[3 * i + k] = clip_y(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
fixed_y_t* out1,
fixed_y_t* out2) {
const int uv_w = w >> 1;
int k = 3;
while (k-- > 0) { // process each R/G/B segments in turn
// special boundary case for i==0
out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
FilterRow(cur_uv, prev_uv, (w - 1) >> 1, best_y + 0 + 1, out1 + 1);
FilterRow(cur_uv, next_uv, (w - 1) >> 1, best_y + w + 1, out2 + 1);
// special boundary case for i == w - 1 when w is even
if (!(w & 1)) {
out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
best_y[w - 1 + 0]);
out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
best_y[w - 1 + w]);
}
out1 += w;
out2 += w;
prev_uv += uv_w;
cur_uv += uv_w;
next_uv += uv_w;
}
}
@ -406,11 +409,11 @@ static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
const int uv_h = h >> 1;
for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
for (i = 0; i < picture->width; ++i) {
const int off = 3 * (i >> 1);
const int off = (i >> 1);
const int W = best_y[i];
const int r = best_uv[off + 0] + W;
const int g = best_uv[off + 1] + W;
const int b = best_uv[off + 2] + W;
const int r = best_uv[off + 0 * uv_w] + W;
const int g = best_uv[off + 1 * uv_w] + W;
const int b = best_uv[off + 2 * uv_w] + W;
dst_y[i] = ConvertRGBToY(r, g, b);
}
best_y += w;
@ -419,10 +422,10 @@ static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
}
for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
for (i = 0; i < uv_w; ++i) {
const int off = 3 * i;
const int r = best_uv[off + 0];
const int g = best_uv[off + 1];
const int b = best_uv[off + 2];
const int off = i;
const int r = best_uv[off + 0 * uv_w];
const int g = best_uv[off + 1 * uv_w];
const int b = best_uv[off + 2 * uv_w];
dst_u[i] = ConvertRGBToU(r, g, b);
dst_v[i] = ConvertRGBToV(r, g, b);
}
@ -480,7 +483,7 @@ static int PreprocessARGB(const uint8_t* r_ptr,
// Import RGB samples to W/RGB representation.
for (j = 0; j < picture->height; j += 2) {
const int is_last_row = (j == picture->height - 1);
fixed_y_t* const src1 = tmp_buffer;
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
// prepare two rows of input
@ -491,7 +494,8 @@ static int PreprocessARGB(const uint8_t* r_ptr,
} else {
memcpy(src2, src1, 3 * w * sizeof(*src2));
}
StoreGray(src1, best_y, 2 * w); // convert two lines at a time
StoreGray(src1, best_y + 0, w);
StoreGray(src2, best_y + w, w);
UpdateW(src1, target_y, w);
UpdateW(src2, target_y + w, w);
@ -517,7 +521,7 @@ static int PreprocessARGB(const uint8_t* r_ptr,
target_y = target_y_base;
target_uv = target_uv_base;
for (j = 0; j < h; j += 2) {
fixed_y_t* const src1 = tmp_buffer;
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
{
const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);