From df6cebfa9ebe47fbfb657696722786b5cacc6ace Mon Sep 17 00:00:00 2001
From: skal
Date: Mon, 19 Aug 2013 12:43:53 -0700
Subject: [PATCH] 5-7% faster SSE2 versions of YUV->RGB conversion functions

The C version gets ~7-8% slower in order to match the SSE2 output
exactly. The old (now off-by-1) code is kept under the
WEBP_YUV_USE_TABLE flag for reference. Note that the rounding
precision of the new calculation is slightly better (~ +0.02dB).

On ARM/NEON, we recover the ~4% speed that had been lost by mimicking
the initial C version (see https://gerrit.chromium.org/gerrit/#/c/41610).

Change-Id: Ia4363c5ed9b4c9edff5d932b002e57bb7814bf6f
---
 src/dsp/upsampling_neon.c | 138 ++++++++++---------------
 src/dsp/upsampling_sse2.c |  86 ++++++++--------
 src/dsp/yuv.c             | 149 ++++++++++++++++++++++++++-
 src/dsp/yuv.h             | 210 ++++++++++++++++++++++----------------
 4 files changed, 365 insertions(+), 218 deletions(-)

diff --git a/src/dsp/upsampling_neon.c b/src/dsp/upsampling_neon.c
index 0b64ed86..f7450932 100644
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@@ -27,6 +27,9 @@ extern "C" {
 
 #ifdef FANCY_UPSAMPLING
 
+//-----------------------------------------------------------------------------
+// U/V upsampling
+
 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
 #define UPSAMPLE_16PIXELS(r1, r2, out) { \
   uint8x8_t a = vld1_u8(r1); \
@@ -85,105 +88,74 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
     Upsample16Pixels(r1, r2, out); \
   }
 
-#define CY 76283
-#define CVR 89858
-#define CUG 22014
-#define CVG 45773
-#define CUB 113618
+//-----------------------------------------------------------------------------
+// YUV->RGB conversion
 
-static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };
-
-#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
-  int i; \
-  for (i = 0; i < N; i += 8) { \
-    int off = ((cur_x) + i) * XSTEP; \
-    uint8x8_t y = vld1_u8(src_y + (cur_x) + i); \
-    uint8x8_t u = vld1_u8((src_uv) + i); \
-    uint8x8_t v = vld1_u8((src_uv) + i + 16); \
-    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
-    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
-    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
- \
-    int16x8_t ud = vshlq_n_s16(uu, 1); \
-    int16x8_t vd = vshlq_n_s16(vv, 1); \
- \
-    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), \
-                                     vget_low_s16(vd), cf16, 0); \
-    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
-                                     vget_high_s16(vd), cf16, 0); \
-    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), \
-                                vrshrn_n_s32(vrh, 16)); \
- \
-    int32x4_t vl = vmovl_s16(vget_low_s16(vv)); \
-    int32x4_t vh = vmovl_s16(vget_high_s16(vv)); \
-    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); \
-    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); \
-    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); \
-    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); \
-    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), \
-                                vrshrn_n_s32(gch, 16)); \
- \
-    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), \
-                                     vget_low_s16(ud), cf16, 3); \
-    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
-                                     vget_high_s16(ud), cf16, 3); \
-    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), \
-                                vrshrn_n_s32(ubh, 16)); \
- \
-    int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); \
-    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); \
-    int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); \
-    int32x4_t
gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); \ - int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); \ - int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); \ - \ - rl = vmulq_lane_s32(rl, cf32, 0); \ - rh = vmulq_lane_s32(rh, cf32, 0); \ - gl = vmulq_lane_s32(gl, cf32, 0); \ - gh = vmulq_lane_s32(gh, cf32, 0); \ - bl = vmulq_lane_s32(bl, cf32, 0); \ - bh = vmulq_lane_s32(bh, cf32, 0); \ - \ - y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), \ - vrshrn_n_s32(rh, 16))); \ - u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), \ - vrshrn_n_s32(gh, 16))); \ - v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), \ - vrshrn_n_s32(bh, 16))); \ - STR_ ## FMT(out + off, y, u, v); \ - } \ -} +static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG }; #define v255 vmov_n_u8(255) -#define STR_Rgb(out, r, g, b) do { \ +#define STORE_Rgb(out, r, g, b) do { \ const uint8x8x3_t r_g_b = {{ r, g, b }}; \ vst3_u8(out, r_g_b); \ } while (0) -#define STR_Bgr(out, r, g, b) do { \ +#define STORE_Bgr(out, r, g, b) do { \ const uint8x8x3_t b_g_r = {{ b, g, r }}; \ vst3_u8(out, b_g_r); \ } while (0) -#define STR_Rgba(out, r, g, b) do { \ +#define STORE_Rgba(out, r, g, b) do { \ const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \ vst4_u8(out, r_g_b_v255); \ } while (0) -#define STR_Bgra(out, r, g, b) do { \ +#define STORE_Bgra(out, r, g, b) do { \ const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \ vst4_u8(out, b_g_r_v255); \ } while (0) -#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) { \ +#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \ + int i; \ + for (i = 0; i < N; i += 8) { \ + const int off = ((cur_x) + i) * XSTEP; \ + uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \ + uint8x8_t u = vld1_u8((src_uv) + i); \ + uint8x8_t v = vld1_u8((src_uv) + i + 16); \ + const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \ + const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \ + const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \ + int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \ + int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \ + const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\ + const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\ + int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \ + int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \ + const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \ + const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \ + gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \ + gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \ + yl = vmlaq_lane_s32(yl, bl, cf32, 0); \ + yh = vmlaq_lane_s32(yh, bh, cf32, 0); \ + /* vrshrn_n_s32() already incorporates the rounding constant */ \ + y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \ + vrshrn_n_s32(rh, YUV_FIX2))); \ + u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \ + vrshrn_n_s32(gh, YUV_FIX2))); \ + v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \ + vrshrn_n_s32(yh, YUV_FIX2))); \ + STORE_ ## FMT(out + off, y, u, v); \ + } \ +} + +#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \ int i; \ for (i = 0; i < N; i++) { \ - int off = ((cur_x) + i) * XSTEP; \ - int y = src_y[(cur_x) + i]; \ - int u = (src_uv)[i]; \ - int v = (src_uv)[i + 16]; \ - VP8YuvTo ## FMT(y, u, v, rgb + off); \ + const int off = ((cur_x) + i) * XSTEP; \ + const int y = src_y[(cur_x) + i]; \ + const int u = (src_uv)[i]; \ + const int v = (src_uv)[i + 16]; \ 
+ FUNC(y, u, v, rgb + off); \ } \ } @@ -195,11 +167,11 @@ static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 }; } \ } -#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv, \ +#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv, \ top_dst, bottom_dst, cur_x, len) { \ - CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \ + CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x); \ if (bottom_y != NULL) { \ - CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \ + CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \ } \ } @@ -221,8 +193,8 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ \ - const int16x4_t cf16 = vld1_s16(coef); \ - const int32x2_t cf32 = vmov_n_s32(CY); \ + const int16x4_t cf16 = vld1_s16(kCoeffs); \ + const int32x2_t cf32 = vmov_n_s32(kUToB); \ const uint8x8_t u16 = vmov_n_u8(16); \ const uint8x8_t u128 = vmov_n_u8(128); \ \ @@ -252,7 +224,7 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ \ UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \ UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \ - CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv, \ + CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \ top_dst, bottom_dst, last_pos, len - last_pos); \ } diff --git a/src/dsp/upsampling_sse2.c b/src/dsp/upsampling_sse2.c index d2a1b535..c30cd3a0 100644 --- a/src/dsp/upsampling_sse2.c +++ b/src/dsp/upsampling_sse2.c @@ -87,8 +87,8 @@ extern "C" { GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \ \ /* pack the alternate pixels */ \ - PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \ - PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \ + PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \ + PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \ } // Turn the macro into a function for reducing code-size when non-critical @@ -108,68 +108,68 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[], Upsample32Pixels(r1, r2, out); \ } -#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \ +#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \ top_dst, bottom_dst, cur_x, num_pixels) { \ int n; \ for (n = 0; n < (num_pixels); ++n) { \ - FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \ + FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \ top_dst + ((cur_x) + n) * XSTEP); \ } \ if (bottom_y != NULL) { \ for (n = 0; n < (num_pixels); ++n) { \ - FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \ + FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \ bottom_dst + ((cur_x) + n) * XSTEP); \ } \ } \ } +#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \ + top_dst, bottom_dst, cur_x) do { \ + FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \ + if (bottom_y != NULL) { \ + FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \ + bottom_dst + (cur_x) * XSTEP); \ + } \ +} while (0) + #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* cur_u, const uint8_t* cur_v, \ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ - int block; \ - /* 16 byte aligned array to cache reconstructed u and v */ \ + int uv_pos, pos; \ + /* 16byte-aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[4 * 32 + 15]; \ - uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & 
~15); \ - const int uv_len = (len + 1) >> 1; \ - /* 17 pixels must be read-able for each block */ \ - const int num_blocks = (uv_len - 1) >> 4; \ - const int leftover = uv_len - num_blocks * 16; \ - const int last_pos = 1 + 32 * num_blocks; \ + uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + uint8_t* const r_v = r_u + 32; \ \ - const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ - const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ - \ - assert(len > 0); \ - /* Treat the first pixel in regular way */ \ assert(top_y != NULL); \ - { \ - const int u0 = (top_u[0] + u_diag) >> 1; \ - const int v0 = (top_v[0] + v_diag) >> 1; \ - FUNC(top_y[0], u0, v0, top_dst); \ + { /* Treat the first pixel in regular way */ \ + const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ + const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ + const int u0_t = (top_u[0] + u_diag) >> 1; \ + const int v0_t = (top_v[0] + v_diag) >> 1; \ + FUNC(top_y[0], u0_t, v0_t, top_dst); \ + if (bottom_y != NULL) { \ + const int u0_b = (cur_u[0] + u_diag) >> 1; \ + const int v0_b = (cur_v[0] + v_diag) >> 1; \ + FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \ + } \ } \ - if (bottom_y != NULL) { \ - const int u0 = (cur_u[0] + u_diag) >> 1; \ - const int v0 = (cur_v[0] + v_diag) >> 1; \ - FUNC(bottom_y[0], u0, v0, bottom_dst); \ + /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \ + for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \ + UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \ + UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \ + CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \ } \ - \ - for (block = 0; block < num_blocks; ++block) { \ - UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \ - UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \ - CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \ - 32 * block + 1, 32) \ - top_u += 16; \ - cur_u += 16; \ - top_v += 16; \ - cur_v += 16; \ + if (len > 1) { \ + const int left_over = ((len + 1) >> 1) - (pos >> 1); \ + assert(left_over > 0); \ + UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \ + UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \ + CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \ + pos, len - pos); \ } \ - \ - UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \ - UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \ - CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \ - last_pos, len - last_pos); \ } // SSE2 variants of the fancy upsampler. 
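Before the SSE2 instantiations below, it helps to pin down the scalar arithmetic that both the NEON and SSE2 paths above implement. The following standalone sketch uses the 14-bit constants and the branchless clip from the yuv.h hunk further down in this patch; the Clip8 name and the main() driver are illustrative additions, not part of the patch, and the (~v) >> 31 trick assumes an arithmetic right shift, as the patch itself does.

#include <stdint.h>
#include <stdio.h>

#define YUV_FIX2 14
#define YUV_HALF2 (1 << (YUV_FIX2 - 1))
#define YUV_MASK2 ((256 << YUV_FIX2) - 1)

#define kYScale 19077   /* 1.164 = 255 / 219 */
#define kVToR   26149   /* 1.596 = 255 / 112 * 0.701 */
#define kUToG    6419   /* 0.391 */
#define kVToG   13320   /* 0.813 */
#define kUToB   33050   /* 2.018 = 255 / 112 * 0.886 */
#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)

static uint8_t Clip8(int v) {
  /* In-range: drop the fractional bits. Out-of-range: (~v) >> 31 yields 0
   * for negative v and 0xff (after the uint8_t cast) for positive overflow. */
  return ((v & ~YUV_MASK2) == 0) ? (uint8_t)(v >> YUV_FIX2)
                                 : (uint8_t)((~v) >> 31);
}

int main(void) {
  const int y = 81, u = 90, v = 240;   /* roughly pure red in BT.601 */
  const int r = Clip8(kYScale * y + kVToR * v + kRCst);
  const int g = Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
  const int b = Clip8(kYScale * y + kUToB * u + kBCst);
  printf("RGB = %d %d %d\n", r, g, b);   /* prints 254 0 0 */
  return 0;
}

Note how the rounding constant YUV_HALF2 is folded into kRCst/kGCst/kBCst, which is why the SIMD paths can use a plain truncating shift.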
@@ -183,6 +183,7 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 #undef UPSAMPLE_32PIXELS
 #undef UPSAMPLE_LAST_BLOCK
 #undef CONVERT2RGB
+#undef CONVERT2RGB_32
 #undef SSE2_UPSAMPLE_FUNC
 
 #endif  // FANCY_UPSAMPLING
@@ -197,6 +198,7 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 
 void WebPInitUpsamplersSSE2(void) {
 #if defined(WEBP_USE_SSE2)
+  VP8YUVInitSSE2();
   WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
   WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
   WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
@@ -214,7 +216,7 @@ void WebPInitPremultiplySSE2(void) {
 
 #else
 
 // this empty function is to avoid an empty .o
-void WebPInitPremultiplySSE2(void);
+void WebPInitPremultiplySSE2(void) {}
 
 #endif  // FANCY_UPSAMPLING
 
diff --git a/src/dsp/yuv.c b/src/dsp/yuv.c
index 1a59f744..3e99892f 100644
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@@ -17,12 +17,8 @@ extern "C" {
 #endif
 
-#ifdef WEBP_YUV_USE_TABLE
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+#if defined(WEBP_YUV_USE_TABLE)
 
 static int done = 0;
 
@@ -30,6 +26,11 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
   return v < 0 ? 0 : v > max_value ? max_value : v;
 }
 
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
 void VP8YUVInit(void) {
   int i;
   if (done) {
@@ -70,6 +71,144 @@ void VP8YUVInit(void) {}
 
 #endif  // WEBP_YUV_USE_TABLE
 
+//-----------------------------------------------------------------------------
+// SSE2 extras
+
+#if defined(WEBP_USE_SSE2)
+
+#ifdef FANCY_UPSAMPLING
+
+#include <emmintrin.h>
+#include <string.h>   // for memcpy
+
+typedef union {   // handy struct for converting SSE2 registers
+  int32_t i32[4];
+  uint8_t u8[16];
+  __m128i m;
+} VP8kCstSSE2;
+
+static int done_sse2 = 0;
+static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
+
+void VP8YUVInitSSE2(void) {
+  if (!done_sse2) {
+    int i;
+    for (i = 0; i < 256; ++i) {
+      VP8kYtoRGBA[i].i32[0] =
+      VP8kYtoRGBA[i].i32[1] =
+      VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
+      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
+
+      VP8kUtoRGBA[i].i32[0] = 0;
+      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
+      VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128);
+      VP8kUtoRGBA[i].i32[3] = 0;
+
+      VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128);
+      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
+      VP8kVtoRGBA[i].i32[2] = 0;
+      VP8kVtoRGBA[i].i32[3] = 0;
+    }
+    done_sse2 = 1;
+  }
+}
+
+static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
+  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
+  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
+  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
+  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
+  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
+  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
+  return rgba2;
+}
+
+static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
+                                        uint8_t* const rgb) {
+  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
+  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
+  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
+  // Note: we store 8 bytes at a time, not 3 bytes!
-> memory stomp + _mm_storel_epi64((__m128i*)rgb, tmp2); +} + +static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const bgr) { + const __m128i tmp0 = VP8GetRGBA32b(y, u, v); + const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2)); + const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1); + const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2); + // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp + _mm_storel_epi64((__m128i*)bgr, tmp3); +} + +void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + for (n = 0; n < 32; n += 4) { + const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); + const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); + const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]); + const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]); + const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); + const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); + const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); + _mm_storeu_si128((__m128i*)dst, tmp2); + dst += 4 * 4; + } +} + +void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + for (n = 0; n < 32; n += 2) { + const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); + const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); + const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); + const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); + const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); + const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); + _mm_storel_epi64((__m128i*)dst, tmp3); + dst += 4 * 2; + } +} + +void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + uint8_t tmp0[2 * 3 + 5 + 15]; + uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align + for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory + VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3); + } + // Last two pixels are special: we write in a tmp buffer before sending + // to dst. + VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); + VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); + memcpy(dst + n * 3, tmp, 2 * 3); +} + +void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + uint8_t tmp0[2 * 3 + 5 + 15]; + uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align + for (n = 0; n < 30; ++n) { + VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3); + } + VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); + VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); + memcpy(dst + n * 3, tmp, 2 * 3); +} + +#else + +void VP8YUVInitSSE2(void) {} + +#endif // FANCY_UPSAMPLING + +#endif // WEBP_USE_SSE2 + #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" #endif diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h index 3844d8ca..e00a3e56 100644 --- a/src/dsp/yuv.h +++ b/src/dsp/yuv.h @@ -14,7 +14,7 @@ // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128 -// We use 16bit fixed point operations for RGB->YUV conversion. +// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX). 
// // For the Y'CbCr to RGB conversion, the BT.601 specification reads: // R = 1.164 * (Y-16) + 1.596 * (V-128) @@ -23,21 +23,24 @@ // where Y is in the [16,235] range, and U/V in the [16,240] range. // In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor // "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table. -// So in this case the formulae should be read as: +// So in this case the formulae should read: // R = 1.164 * [Y + 1.371 * (V-128) ] - 18.624 // G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624 // B = 1.164 * [Y + 1.733 * (U-128)] - 18.624 -// once factorized. Here too, 16bit fixed precision is used. +// once factorized. +// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2). +// That's the maximum possible for a convenient ARM implementation. // // Author: Skal (pascal.massimino@gmail.com) #ifndef WEBP_DSP_YUV_H_ #define WEBP_DSP_YUV_H_ +#include "./dsp.h" #include "../dec/decode_vp8.h" // Define the following to use the LUT-based code: -#define WEBP_YUV_USE_TABLE +// #define WEBP_YUV_USE_TABLE #if defined(WEBP_EXPERIMENTAL_FEATURES) // Do NOT activate this feature for real compression. This is only experimental! @@ -56,14 +59,100 @@ extern "C" { #endif -enum { YUV_FIX = 16, // fixed-point precision +enum { + YUV_FIX = 16, // fixed-point precision for RGB->YUV YUV_HALF = 1 << (YUV_FIX - 1), YUV_MASK = (256 << YUV_FIX) - 1, YUV_RANGE_MIN = -227, // min value of r/g/b output - YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output + YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output + + YUV_FIX2 = 14, // fixed-point precision for YUV->RGB + YUV_HALF2 = 1 << (YUV_FIX2 - 1), + YUV_MASK2 = (256 << YUV_FIX2) - 1 }; -#ifdef WEBP_YUV_USE_TABLE +// These constants are 14b fixed-point version of ITU-R BT.601 constants. +#define kYScale 19077 // 1.164 = 255 / 219 +#define kVToR 26149 // 1.596 = 255 / 112 * 0.701 +#define kUToG 6419 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587 +#define kVToG 13320 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587 +#define kUToB 33050 // 2.018 = 255 / 112 * 0.886 +#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2) +#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2) +#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2) + +//------------------------------------------------------------------------------ + +#ifndef WEBP_YUV_USE_TABLE + +// slower on x86 by ~7-8%, but bit-exact with the SSE2 version + +static WEBP_INLINE uint8_t VP8Clip8(int v) { + return ((v & ~YUV_MASK2) == 0) ? 
(v >> YUV_FIX2) : ((~v) >> 31); +} + +static WEBP_INLINE uint8_t VP8YUVToR(int y, int v) { + return VP8Clip8(kYScale * y + kVToR * v + kRCst); +} + +static WEBP_INLINE uint8_t VP8YUVToG(int y, int u, int v) { + return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst); +} + +static WEBP_INLINE uint8_t VP8YUVToB(int y, int u) { + return VP8Clip8(kYScale * y + kUToB * u + kBCst); +} + +static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgb) { + rgb[0] = VP8YUVToR(y, v); + rgb[1] = VP8YUVToG(y, u, v); + rgb[2] = VP8YUVToB(y, u); +} + +static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const bgr) { + bgr[0] = VP8YUVToB(y, u); + bgr[1] = VP8YUVToG(y, u, v); + bgr[2] = VP8YUVToR(y, v); +} + +static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgb) { + const int r = VP8YUVToR(y, v); // 5 usable bits + const int g = VP8YUVToG(y, u, v); // 6 usable bits + const int b = VP8YUVToB(y, u); // 5 usable bits + const int rg = (r & 0xf8) | (g >> 5); + const int gb = ((g << 3) & 0xe0) | (b >> 3); +#ifdef WEBP_SWAP_16BIT_CSP + rgb[0] = gb; + rgb[1] = rg; +#else + rgb[0] = rg; + rgb[1] = gb; +#endif +} + +static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const argb) { + const int r = VP8YUVToR(y, v); // 4 usable bits + const int g = VP8YUVToG(y, u, v); // 4 usable bits + const int b = VP8YUVToB(y, u); // 4 usable bits + const int rg = (r & 0xf0) | (g >> 4); + const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits +#ifdef WEBP_SWAP_16BIT_CSP + argb[0] = ba; + argb[1] = rg; +#else + argb[0] = rg; + argb[1] = ba; +#endif +} + +#else + +// Table-based version, not totally equivalent to the SSE2 version. +// Rounding diff is only +/-1 though. extern int16_t VP8kVToR[256], VP8kUToB[256]; extern int32_t VP8kVToG[256], VP8kUToG[256]; @@ -125,88 +214,11 @@ static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v, #endif } -#else // Table-free version (slower on x86) - -// These constants are 16b fixed-point version of ITU-R BT.601 constants -#define kYScale 76309 // 1.164 = 255 / 219 -#define kVToR 104597 // 1.596 = 255 / 112 * 0.701 -#define kUToG 25674 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587 -#define kVToG 53278 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587 -#define kUToB 132201 // 2.018 = 255 / 112 * 0.886 -#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF) -#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF) -#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF) - -static WEBP_INLINE uint8_t VP8Clip8(int v) { - return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> YUV_FIX) - : (v < 0) ? 0u : 255u; -} - -static WEBP_INLINE uint8_t VP8ClipN(int v, int N) { // clip to N bits - return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> (YUV_FIX + (8 - N))) - : (v < 0) ? 
0u : (255u >> (8 - N)); -} - -static WEBP_INLINE int VP8YUVToR(int y, int v) { - return kYScale * y + kVToR * v + kRCst; -} - -static WEBP_INLINE int VP8YUVToG(int y, int u, int v) { - return kYScale * y - kUToG * u - kVToG * v + kGCst; -} - -static WEBP_INLINE int VP8YUVToB(int y, int u) { - return kYScale * y + kUToB * u + kBCst; -} - -static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const rgb) { - rgb[0] = VP8Clip8(VP8YUVToR(y, v)); - rgb[1] = VP8Clip8(VP8YUVToG(y, u, v)); - rgb[2] = VP8Clip8(VP8YUVToB(y, u)); -} - -static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const bgr) { - bgr[0] = VP8Clip8(VP8YUVToB(y, u)); - bgr[1] = VP8Clip8(VP8YUVToG(y, u, v)); - bgr[2] = VP8Clip8(VP8YUVToR(y, v)); -} - -static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const rgb) { - const int r = VP8Clip8(VP8YUVToR(y, u)); - const int g = VP8ClipN(VP8YUVToG(y, u, v), 6); - const int b = VP8ClipN(VP8YUVToB(y, v), 5); - const uint8_t rg = (r & 0xf8) | (g >> 3); - const uint8_t gb = (g << 5) | b; -#ifdef WEBP_SWAP_16BIT_CSP - rgb[0] = gb; - rgb[1] = rg; -#else - rgb[0] = rg; - rgb[1] = gb; -#endif -} - -static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const argb) { - const int r = VP8Clip8(VP8YUVToR(y, u)); - const int g = VP8ClipN(VP8YUVToG(y, u, v), 4); - const int b = VP8Clip8(VP8YUVToB(y, v)); - const uint8_t rg = (r & 0xf0) | g; - const uint8_t ba = b | 0x0f; // overwrite the lower 4 bits -#ifdef WEBP_SWAP_16BIT_CSP - argb[0] = ba; - argb[1] = rg; -#else - argb[0] = rg; - argb[1] = ba; -#endif -} - #endif // WEBP_YUV_USE_TABLE +//----------------------------------------------------------------------------- +// Alpha handling variants + static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) { argb[0] = 0xff; @@ -228,6 +240,28 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, // Must be called before everything, to initialize the tables. void VP8YUVInit(void); +//----------------------------------------------------------------------------- +// SSE2 extra functions (mostly for upsampling_sse2.c) + +#if defined(WEBP_USE_SSE2) + +#ifdef FANCY_UPSAMPLING +// Process 32 pixels and store the result (24b or 32b per pixel) in *dst. +extern void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +extern void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +extern void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +extern void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +#endif // FANCY_UPSAMPLING + +// Must be called to initialize tables before using the functions. +extern void VP8YUVInitSSE2(void); + +#endif // WEBP_USE_SSE2 + //------------------------------------------------------------------------------ // RGB -> YUV conversion @@ -257,7 +291,7 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b) { #else // This JPEG-YUV colorspace, only for comparison! -// These are also 16-bit precision coefficients from Rec.601, but with full +// These are also 16bit precision coefficients from Rec.601, but with full // [0..255] output range. static WEBP_INLINE int VP8RGBToY(int r, int g, int b) { const int kRound = (1 << (YUV_FIX - 1));
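As a closing cross-check (the VP8RGBToY/U/V bodies are truncated above), here is a hedged round-trip sketch combining the two precisions: RGB->YUV at 16-bit precision (YUV_FIX), then back to RGB with the new 14-bit path (YUV_FIX2). The RGB->YUV coefficients are reconstructed from the 0.2569/0.5044/0.0979 weights quoted at the top of yuv.h, so treat them as assumptions rather than the patch's exact values.

#include <stdint.h>
#include <stdio.h>

enum { YUV_FIX = 16, YUV_HALF = 1 << (YUV_FIX - 1),
       YUV_FIX2 = 14, YUV_HALF2 = 1 << (YUV_FIX2 - 1),
       YUV_MASK2 = (256 << YUV_FIX2) - 1 };

/* RGB->YUV, 16-bit fixed point: Y = 0.2569*R + 0.5044*G + 0.0979*B + 16, etc. */
static int RGBToY(int r, int g, int b) {
  return (16839 * r + 33059 * g + 6420 * b + YUV_HALF + (16 << YUV_FIX)) >> YUV_FIX;
}
static int RGBToU(int r, int g, int b) {
  return (-9719 * r - 19081 * g + 28800 * b + YUV_HALF + (128 << YUV_FIX)) >> YUV_FIX;
}
static int RGBToV(int r, int g, int b) {
  return (28800 * r - 24116 * g - 4684 * b + YUV_HALF + (128 << YUV_FIX)) >> YUV_FIX;
}

static uint8_t Clip8(int v) {   /* same branchless clip as VP8Clip8 above */
  return ((v & ~YUV_MASK2) == 0) ? (uint8_t)(v >> YUV_FIX2)
                                 : (uint8_t)((~v) >> 31);
}

int main(void) {
  const int r0 = 200, g0 = 120, b0 = 40;
  const int y = RGBToY(r0, g0, b0);   /* 132 */
  const int u = RGBToU(r0, g0, b0);   /*  81 */
  const int v = RGBToV(r0, g0, b0);   /* 169 */
  /* YUV->RGB, 14-bit fixed point, with the kYScale/kVToR/... constants. */
  const int r = Clip8(19077 * y + 26149 * v - 19077 * 16 - 26149 * 128 + YUV_HALF2);
  const int g = Clip8(19077 * y - 6419 * u - 13320 * v
                      - 19077 * 16 + (6419 + 13320) * 128 + YUV_HALF2);
  const int b = Clip8(19077 * y + 33050 * u - 19077 * 16 - 33050 * 128 + YUV_HALF2);
  printf("(%d,%d,%d) -> YUV(%d,%d,%d) -> (%d,%d,%d)\n",
         r0, g0, b0, y, u, v, r, g, b);   /* (200,120,40) -> (201,120,40) */
  return 0;
}

The one-LSB drift on the red channel illustrates the +/-1 rounding difference mentioned in the "Table-based version" comment above.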