Merge "5-7% faster SSE2 versions of YUV->RGB conversion functions"
This commit is contained in:
commit
d50c7e3275
@ -27,6 +27,9 @@ extern "C" {
|
||||
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// U/V upsampling
|
||||
|
||||
// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
|
||||
#define UPSAMPLE_16PIXELS(r1, r2, out) { \
|
||||
uint8x8_t a = vld1_u8(r1); \
|
||||
@ -85,105 +88,74 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
|
||||
Upsample16Pixels(r1, r2, out); \
|
||||
}
|
||||
|
||||
#define CY 76283
|
||||
#define CVR 89858
|
||||
#define CUG 22014
|
||||
#define CVG 45773
|
||||
#define CUB 113618
|
||||
//-----------------------------------------------------------------------------
|
||||
// YUV->RGB conversion
|
||||
|
||||
static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };
|
||||
|
||||
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
|
||||
int i; \
|
||||
for (i = 0; i < N; i += 8) { \
|
||||
int off = ((cur_x) + i) * XSTEP; \
|
||||
uint8x8_t y = vld1_u8(src_y + (cur_x) + i); \
|
||||
uint8x8_t u = vld1_u8((src_uv) + i); \
|
||||
uint8x8_t v = vld1_u8((src_uv) + i + 16); \
|
||||
int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
|
||||
int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
|
||||
int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
|
||||
\
|
||||
int16x8_t ud = vshlq_n_s16(uu, 1); \
|
||||
int16x8_t vd = vshlq_n_s16(vv, 1); \
|
||||
\
|
||||
int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), \
|
||||
vget_low_s16(vd), cf16, 0); \
|
||||
int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
|
||||
vget_high_s16(vd), cf16, 0); \
|
||||
int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), \
|
||||
vrshrn_n_s32(vrh, 16)); \
|
||||
\
|
||||
int32x4_t vl = vmovl_s16(vget_low_s16(vv)); \
|
||||
int32x4_t vh = vmovl_s16(vget_high_s16(vv)); \
|
||||
int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); \
|
||||
int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); \
|
||||
int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); \
|
||||
int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); \
|
||||
int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), \
|
||||
vrshrn_n_s32(gch, 16)); \
|
||||
\
|
||||
int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), \
|
||||
vget_low_s16(ud), cf16, 3); \
|
||||
int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
|
||||
vget_high_s16(ud), cf16, 3); \
|
||||
int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), \
|
||||
vrshrn_n_s32(ubh, 16)); \
|
||||
\
|
||||
int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); \
|
||||
int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); \
|
||||
int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); \
|
||||
int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); \
|
||||
int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); \
|
||||
int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); \
|
||||
\
|
||||
rl = vmulq_lane_s32(rl, cf32, 0); \
|
||||
rh = vmulq_lane_s32(rh, cf32, 0); \
|
||||
gl = vmulq_lane_s32(gl, cf32, 0); \
|
||||
gh = vmulq_lane_s32(gh, cf32, 0); \
|
||||
bl = vmulq_lane_s32(bl, cf32, 0); \
|
||||
bh = vmulq_lane_s32(bh, cf32, 0); \
|
||||
\
|
||||
y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), \
|
||||
vrshrn_n_s32(rh, 16))); \
|
||||
u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), \
|
||||
vrshrn_n_s32(gh, 16))); \
|
||||
v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), \
|
||||
vrshrn_n_s32(bh, 16))); \
|
||||
STR_ ## FMT(out + off, y, u, v); \
|
||||
} \
|
||||
}
|
||||
static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
|
||||
|
||||
#define v255 vmov_n_u8(255)
|
||||
|
||||
#define STR_Rgb(out, r, g, b) do { \
|
||||
#define STORE_Rgb(out, r, g, b) do { \
|
||||
const uint8x8x3_t r_g_b = {{ r, g, b }}; \
|
||||
vst3_u8(out, r_g_b); \
|
||||
} while (0)
|
||||
|
||||
#define STR_Bgr(out, r, g, b) do { \
|
||||
#define STORE_Bgr(out, r, g, b) do { \
|
||||
const uint8x8x3_t b_g_r = {{ b, g, r }}; \
|
||||
vst3_u8(out, b_g_r); \
|
||||
} while (0)
|
||||
|
||||
#define STR_Rgba(out, r, g, b) do { \
|
||||
#define STORE_Rgba(out, r, g, b) do { \
|
||||
const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \
|
||||
vst4_u8(out, r_g_b_v255); \
|
||||
} while (0)
|
||||
|
||||
#define STR_Bgra(out, r, g, b) do { \
|
||||
#define STORE_Bgra(out, r, g, b) do { \
|
||||
const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \
|
||||
vst4_u8(out, b_g_r_v255); \
|
||||
} while (0)
|
||||
|
||||
#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) { \
|
||||
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
|
||||
int i; \
|
||||
for (i = 0; i < N; i += 8) { \
|
||||
const int off = ((cur_x) + i) * XSTEP; \
|
||||
uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
|
||||
uint8x8_t u = vld1_u8((src_uv) + i); \
|
||||
uint8x8_t v = vld1_u8((src_uv) + i + 16); \
|
||||
const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
|
||||
const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
|
||||
const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
|
||||
int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \
|
||||
int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \
|
||||
const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\
|
||||
const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
|
||||
int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \
|
||||
int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \
|
||||
const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \
|
||||
const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \
|
||||
gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \
|
||||
gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \
|
||||
yl = vmlaq_lane_s32(yl, bl, cf32, 0); \
|
||||
yh = vmlaq_lane_s32(yh, bh, cf32, 0); \
|
||||
/* vrshrn_n_s32() already incorporates the rounding constant */ \
|
||||
y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \
|
||||
vrshrn_n_s32(rh, YUV_FIX2))); \
|
||||
u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \
|
||||
vrshrn_n_s32(gh, YUV_FIX2))); \
|
||||
v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \
|
||||
vrshrn_n_s32(yh, YUV_FIX2))); \
|
||||
STORE_ ## FMT(out + off, y, u, v); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \
|
||||
int i; \
|
||||
for (i = 0; i < N; i++) { \
|
||||
int off = ((cur_x) + i) * XSTEP; \
|
||||
int y = src_y[(cur_x) + i]; \
|
||||
int u = (src_uv)[i]; \
|
||||
int v = (src_uv)[i + 16]; \
|
||||
VP8YuvTo ## FMT(y, u, v, rgb + off); \
|
||||
const int off = ((cur_x) + i) * XSTEP; \
|
||||
const int y = src_y[(cur_x) + i]; \
|
||||
const int u = (src_uv)[i]; \
|
||||
const int v = (src_uv)[i + 16]; \
|
||||
FUNC(y, u, v, rgb + off); \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -195,11 +167,11 @@ static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };
|
||||
} \
|
||||
}
|
||||
|
||||
#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv, \
|
||||
#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv, \
|
||||
top_dst, bottom_dst, cur_x, len) { \
|
||||
CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \
|
||||
CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x); \
|
||||
if (bottom_y != NULL) { \
|
||||
CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
|
||||
CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -221,8 +193,8 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
|
||||
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
|
||||
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
|
||||
\
|
||||
const int16x4_t cf16 = vld1_s16(coef); \
|
||||
const int32x2_t cf32 = vmov_n_s32(CY); \
|
||||
const int16x4_t cf16 = vld1_s16(kCoeffs); \
|
||||
const int32x2_t cf32 = vmov_n_s32(kUToB); \
|
||||
const uint8x8_t u16 = vmov_n_u8(16); \
|
||||
const uint8x8_t u128 = vmov_n_u8(128); \
|
||||
\
|
||||
@ -252,7 +224,7 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
|
||||
\
|
||||
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \
|
||||
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \
|
||||
CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv, \
|
||||
CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \
|
||||
top_dst, bottom_dst, last_pos, len - last_pos); \
|
||||
}
|
||||
|
||||
|
@ -87,8 +87,8 @@ extern "C" {
|
||||
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
|
||||
\
|
||||
/* pack the alternate pixels */ \
|
||||
PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \
|
||||
PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \
|
||||
PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
|
||||
PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
|
||||
}
|
||||
|
||||
// Turn the macro into a function for reducing code-size when non-critical
|
||||
@ -108,68 +108,68 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
|
||||
Upsample32Pixels(r1, r2, out); \
|
||||
}
|
||||
|
||||
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \
|
||||
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
|
||||
top_dst, bottom_dst, cur_x, num_pixels) { \
|
||||
int n; \
|
||||
for (n = 0; n < (num_pixels); ++n) { \
|
||||
FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \
|
||||
FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
|
||||
top_dst + ((cur_x) + n) * XSTEP); \
|
||||
} \
|
||||
if (bottom_y != NULL) { \
|
||||
for (n = 0; n < (num_pixels); ++n) { \
|
||||
FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \
|
||||
FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
|
||||
bottom_dst + ((cur_x) + n) * XSTEP); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
|
||||
top_dst, bottom_dst, cur_x) do { \
|
||||
FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
|
||||
if (bottom_y != NULL) { \
|
||||
FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
|
||||
bottom_dst + (cur_x) * XSTEP); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
|
||||
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
|
||||
const uint8_t* top_u, const uint8_t* top_v, \
|
||||
const uint8_t* cur_u, const uint8_t* cur_v, \
|
||||
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
|
||||
int block; \
|
||||
/* 16 byte aligned array to cache reconstructed u and v */ \
|
||||
int uv_pos, pos; \
|
||||
/* 16byte-aligned array to cache reconstructed u and v */ \
|
||||
uint8_t uv_buf[4 * 32 + 15]; \
|
||||
uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
|
||||
const int uv_len = (len + 1) >> 1; \
|
||||
/* 17 pixels must be read-able for each block */ \
|
||||
const int num_blocks = (uv_len - 1) >> 4; \
|
||||
const int leftover = uv_len - num_blocks * 16; \
|
||||
const int last_pos = 1 + 32 * num_blocks; \
|
||||
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
|
||||
uint8_t* const r_v = r_u + 32; \
|
||||
\
|
||||
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
|
||||
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
|
||||
\
|
||||
assert(len > 0); \
|
||||
/* Treat the first pixel in regular way */ \
|
||||
assert(top_y != NULL); \
|
||||
{ \
|
||||
const int u0 = (top_u[0] + u_diag) >> 1; \
|
||||
const int v0 = (top_v[0] + v_diag) >> 1; \
|
||||
FUNC(top_y[0], u0, v0, top_dst); \
|
||||
{ /* Treat the first pixel in regular way */ \
|
||||
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
|
||||
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
|
||||
const int u0_t = (top_u[0] + u_diag) >> 1; \
|
||||
const int v0_t = (top_v[0] + v_diag) >> 1; \
|
||||
FUNC(top_y[0], u0_t, v0_t, top_dst); \
|
||||
if (bottom_y != NULL) { \
|
||||
const int u0_b = (cur_u[0] + u_diag) >> 1; \
|
||||
const int v0_b = (cur_v[0] + v_diag) >> 1; \
|
||||
FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
|
||||
} \
|
||||
} \
|
||||
if (bottom_y != NULL) { \
|
||||
const int u0 = (cur_u[0] + u_diag) >> 1; \
|
||||
const int v0 = (cur_v[0] + v_diag) >> 1; \
|
||||
FUNC(bottom_y[0], u0, v0, bottom_dst); \
|
||||
/* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \
|
||||
for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
|
||||
UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
|
||||
UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
|
||||
CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
|
||||
} \
|
||||
\
|
||||
for (block = 0; block < num_blocks; ++block) { \
|
||||
UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \
|
||||
UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \
|
||||
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
|
||||
32 * block + 1, 32) \
|
||||
top_u += 16; \
|
||||
cur_u += 16; \
|
||||
top_v += 16; \
|
||||
cur_v += 16; \
|
||||
if (len > 1) { \
|
||||
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
|
||||
assert(left_over > 0); \
|
||||
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
|
||||
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
|
||||
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \
|
||||
pos, len - pos); \
|
||||
} \
|
||||
\
|
||||
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \
|
||||
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \
|
||||
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
|
||||
last_pos, len - last_pos); \
|
||||
}
|
||||
|
||||
// SSE2 variants of the fancy upsampler.
|
||||
@ -183,6 +183,7 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
|
||||
#undef UPSAMPLE_32PIXELS
|
||||
#undef UPSAMPLE_LAST_BLOCK
|
||||
#undef CONVERT2RGB
|
||||
#undef CONVERT2RGB_32
|
||||
#undef SSE2_UPSAMPLE_FUNC
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
@ -197,6 +198,7 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
|
||||
|
||||
void WebPInitUpsamplersSSE2(void) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
VP8YUVInitSSE2();
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
|
||||
@ -214,7 +216,7 @@ void WebPInitPremultiplySSE2(void) {
|
||||
#else
|
||||
|
||||
// this empty function is to avoid an empty .o
|
||||
void WebPInitPremultiplySSE2(void);
|
||||
void WebPInitPremultiplySSE2(void) {}
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
|
||||
|
149
src/dsp/yuv.c
149
src/dsp/yuv.c
@ -17,12 +17,8 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef WEBP_YUV_USE_TABLE
|
||||
|
||||
int16_t VP8kVToR[256], VP8kUToB[256];
|
||||
int32_t VP8kVToG[256], VP8kUToG[256];
|
||||
uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
|
||||
uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
|
||||
#if defined(WEBP_YUV_USE_TABLE)
|
||||
|
||||
static int done = 0;
|
||||
|
||||
@ -30,6 +26,11 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
|
||||
return v < 0 ? 0 : v > max_value ? max_value : v;
|
||||
}
|
||||
|
||||
int16_t VP8kVToR[256], VP8kUToB[256];
|
||||
int32_t VP8kVToG[256], VP8kUToG[256];
|
||||
uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
|
||||
uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
|
||||
|
||||
void VP8YUVInit(void) {
|
||||
int i;
|
||||
if (done) {
|
||||
@ -70,6 +71,144 @@ void VP8YUVInit(void) {}
|
||||
|
||||
#endif // WEBP_YUV_USE_TABLE
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// SSE2 extras
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <string.h> // for memcpy
|
||||
|
||||
typedef union { // handy struct for converting SSE2 registers
|
||||
int32_t i32[4];
|
||||
uint8_t u8[16];
|
||||
__m128i m;
|
||||
} VP8kCstSSE2;
|
||||
|
||||
static int done_sse2 = 0;
|
||||
static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
|
||||
|
||||
void VP8YUVInitSSE2(void) {
|
||||
if (!done_sse2) {
|
||||
int i;
|
||||
for (i = 0; i < 256; ++i) {
|
||||
VP8kYtoRGBA[i].i32[0] =
|
||||
VP8kYtoRGBA[i].i32[1] =
|
||||
VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
|
||||
VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
|
||||
|
||||
VP8kUtoRGBA[i].i32[0] = 0;
|
||||
VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
|
||||
VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128);
|
||||
VP8kUtoRGBA[i].i32[3] = 0;
|
||||
|
||||
VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128);
|
||||
VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
|
||||
VP8kVtoRGBA[i].i32[2] = 0;
|
||||
VP8kVtoRGBA[i].i32[3] = 0;
|
||||
}
|
||||
done_sse2 = 1;
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
|
||||
const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
|
||||
const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
|
||||
const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
|
||||
const __m128i uv_part = _mm_add_epi32(u_part, v_part);
|
||||
const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
|
||||
const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
|
||||
return rgba2;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const rgb) {
|
||||
const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
|
||||
const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
|
||||
const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
|
||||
// Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
|
||||
_mm_storel_epi64((__m128i*)rgb, tmp2);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const bgr) {
|
||||
const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
|
||||
const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
|
||||
const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
|
||||
const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
|
||||
// Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
|
||||
_mm_storel_epi64((__m128i*)bgr, tmp3);
|
||||
}
|
||||
|
||||
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 4) {
|
||||
const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
|
||||
const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
|
||||
const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
|
||||
const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
|
||||
const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
|
||||
const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
|
||||
const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
|
||||
_mm_storeu_si128((__m128i*)dst, tmp2);
|
||||
dst += 4 * 4;
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 2) {
|
||||
const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
|
||||
const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
|
||||
const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
|
||||
const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
|
||||
const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
|
||||
const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
|
||||
_mm_storel_epi64((__m128i*)dst, tmp3);
|
||||
dst += 4 * 2;
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
int n;
|
||||
uint8_t tmp0[2 * 3 + 5 + 15];
|
||||
uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
|
||||
for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory
|
||||
VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
|
||||
}
|
||||
// Last two pixels are special: we write in a tmp buffer before sending
|
||||
// to dst.
|
||||
VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
|
||||
VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
|
||||
memcpy(dst + n * 3, tmp, 2 * 3);
|
||||
}
|
||||
|
||||
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
int n;
|
||||
uint8_t tmp0[2 * 3 + 5 + 15];
|
||||
uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
|
||||
for (n = 0; n < 30; ++n) {
|
||||
VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
|
||||
}
|
||||
VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
|
||||
VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
|
||||
memcpy(dst + n * 3, tmp, 2 * 3);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void VP8YUVInitSSE2(void) {}
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
||||
|
||||
#if defined(__cplusplus) || defined(c_plusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
210
src/dsp/yuv.h
210
src/dsp/yuv.h
@ -14,7 +14,7 @@
|
||||
// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
|
||||
// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
|
||||
// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
|
||||
// We use 16bit fixed point operations for RGB->YUV conversion.
|
||||
// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
|
||||
//
|
||||
// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
|
||||
// R = 1.164 * (Y-16) + 1.596 * (V-128)
|
||||
@ -23,21 +23,24 @@
|
||||
// where Y is in the [16,235] range, and U/V in the [16,240] range.
|
||||
// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
|
||||
// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
|
||||
// So in this case the formulae should be read as:
|
||||
// So in this case the formulae should read:
|
||||
// R = 1.164 * [Y + 1.371 * (V-128) ] - 18.624
|
||||
// G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
|
||||
// B = 1.164 * [Y + 1.733 * (U-128)] - 18.624
|
||||
// once factorized. Here too, 16bit fixed precision is used.
|
||||
// once factorized.
|
||||
// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
|
||||
// That's the maximum possible for a convenient ARM implementation.
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#ifndef WEBP_DSP_YUV_H_
|
||||
#define WEBP_DSP_YUV_H_
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "../dec/decode_vp8.h"
|
||||
|
||||
// Define the following to use the LUT-based code:
|
||||
#define WEBP_YUV_USE_TABLE
|
||||
// #define WEBP_YUV_USE_TABLE
|
||||
|
||||
#if defined(WEBP_EXPERIMENTAL_FEATURES)
|
||||
// Do NOT activate this feature for real compression. This is only experimental!
|
||||
@ -56,14 +59,100 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum { YUV_FIX = 16, // fixed-point precision
|
||||
enum {
|
||||
YUV_FIX = 16, // fixed-point precision for RGB->YUV
|
||||
YUV_HALF = 1 << (YUV_FIX - 1),
|
||||
YUV_MASK = (256 << YUV_FIX) - 1,
|
||||
YUV_RANGE_MIN = -227, // min value of r/g/b output
|
||||
YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output
|
||||
YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output
|
||||
|
||||
YUV_FIX2 = 14, // fixed-point precision for YUV->RGB
|
||||
YUV_HALF2 = 1 << (YUV_FIX2 - 1),
|
||||
YUV_MASK2 = (256 << YUV_FIX2) - 1
|
||||
};
|
||||
|
||||
#ifdef WEBP_YUV_USE_TABLE
|
||||
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
|
||||
#define kYScale 19077 // 1.164 = 255 / 219
|
||||
#define kVToR 26149 // 1.596 = 255 / 112 * 0.701
|
||||
#define kUToG 6419 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
|
||||
#define kVToG 13320 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
|
||||
#define kUToB 33050 // 2.018 = 255 / 112 * 0.886
|
||||
#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
|
||||
#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
|
||||
#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#ifndef WEBP_YUV_USE_TABLE
|
||||
|
||||
// slower on x86 by ~7-8%, but bit-exact with the SSE2 version
|
||||
|
||||
static WEBP_INLINE uint8_t VP8Clip8(int v) {
|
||||
return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : ((~v) >> 31);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint8_t VP8YUVToR(int y, int v) {
|
||||
return VP8Clip8(kYScale * y + kVToR * v + kRCst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint8_t VP8YUVToG(int y, int u, int v) {
|
||||
return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint8_t VP8YUVToB(int y, int u) {
|
||||
return VP8Clip8(kYScale * y + kUToB * u + kBCst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const rgb) {
|
||||
rgb[0] = VP8YUVToR(y, v);
|
||||
rgb[1] = VP8YUVToG(y, u, v);
|
||||
rgb[2] = VP8YUVToB(y, u);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const bgr) {
|
||||
bgr[0] = VP8YUVToB(y, u);
|
||||
bgr[1] = VP8YUVToG(y, u, v);
|
||||
bgr[2] = VP8YUVToR(y, v);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const rgb) {
|
||||
const int r = VP8YUVToR(y, v); // 5 usable bits
|
||||
const int g = VP8YUVToG(y, u, v); // 6 usable bits
|
||||
const int b = VP8YUVToB(y, u); // 5 usable bits
|
||||
const int rg = (r & 0xf8) | (g >> 5);
|
||||
const int gb = ((g << 3) & 0xe0) | (b >> 3);
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
rgb[0] = gb;
|
||||
rgb[1] = rg;
|
||||
#else
|
||||
rgb[0] = rg;
|
||||
rgb[1] = gb;
|
||||
#endif
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const argb) {
|
||||
const int r = VP8YUVToR(y, v); // 4 usable bits
|
||||
const int g = VP8YUVToG(y, u, v); // 4 usable bits
|
||||
const int b = VP8YUVToB(y, u); // 4 usable bits
|
||||
const int rg = (r & 0xf0) | (g >> 4);
|
||||
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
argb[0] = ba;
|
||||
argb[1] = rg;
|
||||
#else
|
||||
argb[0] = rg;
|
||||
argb[1] = ba;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Table-based version, not totally equivalent to the SSE2 version.
|
||||
// Rounding diff is only +/-1 though.
|
||||
|
||||
extern int16_t VP8kVToR[256], VP8kUToB[256];
|
||||
extern int32_t VP8kVToG[256], VP8kUToG[256];
|
||||
@ -125,88 +214,11 @@ static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
|
||||
#endif
|
||||
}
|
||||
|
||||
#else // Table-free version (slower on x86)
|
||||
|
||||
// These constants are 16b fixed-point version of ITU-R BT.601 constants
|
||||
#define kYScale 76309 // 1.164 = 255 / 219
|
||||
#define kVToR 104597 // 1.596 = 255 / 112 * 0.701
|
||||
#define kUToG 25674 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
|
||||
#define kVToG 53278 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
|
||||
#define kUToB 132201 // 2.018 = 255 / 112 * 0.886
|
||||
#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF)
|
||||
#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF)
|
||||
#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF)
|
||||
|
||||
static WEBP_INLINE uint8_t VP8Clip8(int v) {
|
||||
return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> YUV_FIX)
|
||||
: (v < 0) ? 0u : 255u;
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint8_t VP8ClipN(int v, int N) { // clip to N bits
|
||||
return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> (YUV_FIX + (8 - N)))
|
||||
: (v < 0) ? 0u : (255u >> (8 - N));
|
||||
}
|
||||
|
||||
static WEBP_INLINE int VP8YUVToR(int y, int v) {
|
||||
return kYScale * y + kVToR * v + kRCst;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
|
||||
return kYScale * y - kUToG * u - kVToG * v + kGCst;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int VP8YUVToB(int y, int u) {
|
||||
return kYScale * y + kUToB * u + kBCst;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const rgb) {
|
||||
rgb[0] = VP8Clip8(VP8YUVToR(y, v));
|
||||
rgb[1] = VP8Clip8(VP8YUVToG(y, u, v));
|
||||
rgb[2] = VP8Clip8(VP8YUVToB(y, u));
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const bgr) {
|
||||
bgr[0] = VP8Clip8(VP8YUVToB(y, u));
|
||||
bgr[1] = VP8Clip8(VP8YUVToG(y, u, v));
|
||||
bgr[2] = VP8Clip8(VP8YUVToR(y, v));
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const rgb) {
|
||||
const int r = VP8Clip8(VP8YUVToR(y, u));
|
||||
const int g = VP8ClipN(VP8YUVToG(y, u, v), 6);
|
||||
const int b = VP8ClipN(VP8YUVToB(y, v), 5);
|
||||
const uint8_t rg = (r & 0xf8) | (g >> 3);
|
||||
const uint8_t gb = (g << 5) | b;
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
rgb[0] = gb;
|
||||
rgb[1] = rg;
|
||||
#else
|
||||
rgb[0] = rg;
|
||||
rgb[1] = gb;
|
||||
#endif
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const argb) {
|
||||
const int r = VP8Clip8(VP8YUVToR(y, u));
|
||||
const int g = VP8ClipN(VP8YUVToG(y, u, v), 4);
|
||||
const int b = VP8Clip8(VP8YUVToB(y, v));
|
||||
const uint8_t rg = (r & 0xf0) | g;
|
||||
const uint8_t ba = b | 0x0f; // overwrite the lower 4 bits
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
argb[0] = ba;
|
||||
argb[1] = rg;
|
||||
#else
|
||||
argb[0] = rg;
|
||||
argb[1] = ba;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // WEBP_YUV_USE_TABLE
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Alpha handling variants
|
||||
|
||||
static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const argb) {
|
||||
argb[0] = 0xff;
|
||||
@ -228,6 +240,28 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
|
||||
// Must be called before everything, to initialize the tables.
|
||||
void VP8YUVInit(void);
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// SSE2 extra functions (mostly for upsampling_sse2.c)
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
|
||||
extern void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
extern void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
extern void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
extern void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
#endif // FANCY_UPSAMPLING
|
||||
|
||||
// Must be called to initialize tables before using the functions.
|
||||
extern void VP8YUVInitSSE2(void);
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// RGB -> YUV conversion
|
||||
|
||||
@ -257,7 +291,7 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
|
||||
#else
|
||||
|
||||
// This JPEG-YUV colorspace, only for comparison!
|
||||
// These are also 16-bit precision coefficients from Rec.601, but with full
|
||||
// These are also 16bit precision coefficients from Rec.601, but with full
|
||||
// [0..255] output range.
|
||||
static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
|
||||
const int kRound = (1 << (YUV_FIX - 1));
|
||||
|
Loading…
x
Reference in New Issue
Block a user