From df6cebfa9ebe47fbfb657696722786b5cacc6ace Mon Sep 17 00:00:00 2001
From: skal
Date: Mon, 19 Aug 2013 12:43:53 -0700
Subject: [PATCH] 5-7% faster SSE2 versions of YUV->RGB conversion functions

The C version gets ~7-8% slower in order to match the SSE2 output
exactly. The old (now off-by-1) code is kept under the
WEBP_YUV_USE_TABLE flag for reference. Note that the rounding
precision of the new calculation is slightly better (~ +0.02dB).

On ARM/NEON, we recover the ~4% speed that had been lost by mimicking
the initial C version (see https://gerrit.chromium.org/gerrit/#/c/41610).

Change-Id: Ia4363c5ed9b4c9edff5d932b002e57bb7814bf6f
---
 src/dsp/upsampling_neon.c | 138 ++++++++++---------------
 src/dsp/upsampling_sse2.c |  86 ++++++++--------
 src/dsp/yuv.c             | 149 ++++++++++++++++++++++++++-
 src/dsp/yuv.h             | 210 ++++++++++++++++++++++----------------
 4 files changed, 365 insertions(+), 218 deletions(-)

diff --git a/src/dsp/upsampling_neon.c b/src/dsp/upsampling_neon.c
index 0b64ed86..f7450932 100644
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@@ -27,6 +27,9 @@ extern "C" {
 
 #ifdef FANCY_UPSAMPLING
 
+//-----------------------------------------------------------------------------
+// U/V upsampling
+
 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
 #define UPSAMPLE_16PIXELS(r1, r2, out) { \
   uint8x8_t a = vld1_u8(r1); \
@@ -85,105 +88,74 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
     Upsample16Pixels(r1, r2, out); \
   }
 
-#define CY 76283
-#define CVR 89858
-#define CUG 22014
-#define CVG 45773
-#define CUB 113618
+//-----------------------------------------------------------------------------
+// YUV->RGB conversion
 
-static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };
-
-#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
-  int i; \
-  for (i = 0; i < N; i += 8) { \
-    int off = ((cur_x) + i) * XSTEP; \
-    uint8x8_t y = vld1_u8(src_y + (cur_x) + i); \
-    uint8x8_t u = vld1_u8((src_uv) + i); \
-    uint8x8_t v = vld1_u8((src_uv) + i + 16); \
-    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
-    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
-    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
- \
-    int16x8_t ud = vshlq_n_s16(uu, 1); \
-    int16x8_t vd = vshlq_n_s16(vv, 1); \
- \
-    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), \
-                                     vget_low_s16(vd), cf16, 0); \
-    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
-                                     vget_high_s16(vd), cf16, 0); \
-    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), \
-                                vrshrn_n_s32(vrh, 16)); \
- \
-    int32x4_t vl = vmovl_s16(vget_low_s16(vv)); \
-    int32x4_t vh = vmovl_s16(vget_high_s16(vv)); \
-    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); \
-    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); \
-    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); \
-    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); \
-    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), \
-                                vrshrn_n_s32(gch, 16)); \
- \
-    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), \
-                                     vget_low_s16(ud), cf16, 3); \
-    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
-                                     vget_high_s16(ud), cf16, 3); \
-    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), \
-                                vrshrn_n_s32(ubh, 16)); \
- \
-    int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); \
-    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); \
-    int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); \
-    int32x4_t
gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); \ - int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); \ - int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); \ - \ - rl = vmulq_lane_s32(rl, cf32, 0); \ - rh = vmulq_lane_s32(rh, cf32, 0); \ - gl = vmulq_lane_s32(gl, cf32, 0); \ - gh = vmulq_lane_s32(gh, cf32, 0); \ - bl = vmulq_lane_s32(bl, cf32, 0); \ - bh = vmulq_lane_s32(bh, cf32, 0); \ - \ - y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), \ - vrshrn_n_s32(rh, 16))); \ - u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), \ - vrshrn_n_s32(gh, 16))); \ - v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), \ - vrshrn_n_s32(bh, 16))); \ - STR_ ## FMT(out + off, y, u, v); \ - } \ -} +static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG }; #define v255 vmov_n_u8(255) -#define STR_Rgb(out, r, g, b) do { \ +#define STORE_Rgb(out, r, g, b) do { \ const uint8x8x3_t r_g_b = {{ r, g, b }}; \ vst3_u8(out, r_g_b); \ } while (0) -#define STR_Bgr(out, r, g, b) do { \ +#define STORE_Bgr(out, r, g, b) do { \ const uint8x8x3_t b_g_r = {{ b, g, r }}; \ vst3_u8(out, b_g_r); \ } while (0) -#define STR_Rgba(out, r, g, b) do { \ +#define STORE_Rgba(out, r, g, b) do { \ const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \ vst4_u8(out, r_g_b_v255); \ } while (0) -#define STR_Bgra(out, r, g, b) do { \ +#define STORE_Bgra(out, r, g, b) do { \ const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \ vst4_u8(out, b_g_r_v255); \ } while (0) -#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) { \ +#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \ + int i; \ + for (i = 0; i < N; i += 8) { \ + const int off = ((cur_x) + i) * XSTEP; \ + uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \ + uint8x8_t u = vld1_u8((src_uv) + i); \ + uint8x8_t v = vld1_u8((src_uv) + i + 16); \ + const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \ + const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \ + const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \ + int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \ + int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \ + const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\ + const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\ + int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \ + int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \ + const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \ + const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \ + gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \ + gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \ + yl = vmlaq_lane_s32(yl, bl, cf32, 0); \ + yh = vmlaq_lane_s32(yh, bh, cf32, 0); \ + /* vrshrn_n_s32() already incorporates the rounding constant */ \ + y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \ + vrshrn_n_s32(rh, YUV_FIX2))); \ + u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \ + vrshrn_n_s32(gh, YUV_FIX2))); \ + v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \ + vrshrn_n_s32(yh, YUV_FIX2))); \ + STORE_ ## FMT(out + off, y, u, v); \ + } \ +} + +#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \ int i; \ for (i = 0; i < N; i++) { \ - int off = ((cur_x) + i) * XSTEP; \ - int y = src_y[(cur_x) + i]; \ - int u = (src_uv)[i]; \ - int v = (src_uv)[i + 16]; \ - VP8YuvTo ## FMT(y, u, v, rgb + off); \ + const int off = ((cur_x) + i) * XSTEP; \ + const int y = src_y[(cur_x) + i]; \ + const int u = (src_uv)[i]; \ + const int v = (src_uv)[i + 16]; \ 
+ FUNC(y, u, v, rgb + off); \ } \ } @@ -195,11 +167,11 @@ static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 }; } \ } -#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv, \ +#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv, \ top_dst, bottom_dst, cur_x, len) { \ - CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \ + CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x); \ if (bottom_y != NULL) { \ - CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \ + CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \ } \ } @@ -221,8 +193,8 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ \ - const int16x4_t cf16 = vld1_s16(coef); \ - const int32x2_t cf32 = vmov_n_s32(CY); \ + const int16x4_t cf16 = vld1_s16(kCoeffs); \ + const int32x2_t cf32 = vmov_n_s32(kUToB); \ const uint8x8_t u16 = vmov_n_u8(16); \ const uint8x8_t u128 = vmov_n_u8(128); \ \ @@ -252,7 +224,7 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ \ UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \ UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \ - CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv, \ + CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \ top_dst, bottom_dst, last_pos, len - last_pos); \ } diff --git a/src/dsp/upsampling_sse2.c b/src/dsp/upsampling_sse2.c index d2a1b535..c30cd3a0 100644 --- a/src/dsp/upsampling_sse2.c +++ b/src/dsp/upsampling_sse2.c @@ -87,8 +87,8 @@ extern "C" { GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \ \ /* pack the alternate pixels */ \ - PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \ - PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \ + PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \ + PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \ } // Turn the macro into a function for reducing code-size when non-critical @@ -108,68 +108,68 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[], Upsample32Pixels(r1, r2, out); \ } -#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \ +#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \ top_dst, bottom_dst, cur_x, num_pixels) { \ int n; \ for (n = 0; n < (num_pixels); ++n) { \ - FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \ + FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \ top_dst + ((cur_x) + n) * XSTEP); \ } \ if (bottom_y != NULL) { \ for (n = 0; n < (num_pixels); ++n) { \ - FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \ + FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \ bottom_dst + ((cur_x) + n) * XSTEP); \ } \ } \ } +#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \ + top_dst, bottom_dst, cur_x) do { \ + FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \ + if (bottom_y != NULL) { \ + FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \ + bottom_dst + (cur_x) * XSTEP); \ + } \ +} while (0) + #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* cur_u, const uint8_t* cur_v, \ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ - int block; \ - /* 16 byte aligned array to cache reconstructed u and v */ \ + int uv_pos, pos; \ + /* 16byte-aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[4 * 32 + 15]; \ - uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & 
~15); \ - const int uv_len = (len + 1) >> 1; \ - /* 17 pixels must be read-able for each block */ \ - const int num_blocks = (uv_len - 1) >> 4; \ - const int leftover = uv_len - num_blocks * 16; \ - const int last_pos = 1 + 32 * num_blocks; \ + uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + uint8_t* const r_v = r_u + 32; \ \ - const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ - const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ - \ - assert(len > 0); \ - /* Treat the first pixel in regular way */ \ assert(top_y != NULL); \ - { \ - const int u0 = (top_u[0] + u_diag) >> 1; \ - const int v0 = (top_v[0] + v_diag) >> 1; \ - FUNC(top_y[0], u0, v0, top_dst); \ + { /* Treat the first pixel in regular way */ \ + const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ + const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ + const int u0_t = (top_u[0] + u_diag) >> 1; \ + const int v0_t = (top_v[0] + v_diag) >> 1; \ + FUNC(top_y[0], u0_t, v0_t, top_dst); \ + if (bottom_y != NULL) { \ + const int u0_b = (cur_u[0] + u_diag) >> 1; \ + const int v0_b = (cur_v[0] + v_diag) >> 1; \ + FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \ + } \ } \ - if (bottom_y != NULL) { \ - const int u0 = (cur_u[0] + u_diag) >> 1; \ - const int v0 = (cur_v[0] + v_diag) >> 1; \ - FUNC(bottom_y[0], u0, v0, bottom_dst); \ + /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \ + for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \ + UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \ + UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \ + CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \ } \ - \ - for (block = 0; block < num_blocks; ++block) { \ - UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \ - UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \ - CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \ - 32 * block + 1, 32) \ - top_u += 16; \ - cur_u += 16; \ - top_v += 16; \ - cur_v += 16; \ + if (len > 1) { \ + const int left_over = ((len + 1) >> 1) - (pos >> 1); \ + assert(left_over > 0); \ + UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \ + UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \ + CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \ + pos, len - pos); \ } \ - \ - UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \ - UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \ - CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \ - last_pos, len - last_pos); \ } // SSE2 variants of the fancy upsampler. 
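Before the SSE2 instantiations below, it helps to pin down the scalar arithmetic that both the NEON and SSE2 paths above implement. The following standalone sketch uses the 14-bit constants and the branchless clip from the yuv.h hunk further down in this patch; the Clip8 name and the main() driver are illustrative additions, not part of the patch, and the (~v) >> 31 trick assumes an arithmetic right shift, as the patch itself does.

#include <stdint.h>
#include <stdio.h>

#define YUV_FIX2 14
#define YUV_HALF2 (1 << (YUV_FIX2 - 1))
#define YUV_MASK2 ((256 << YUV_FIX2) - 1)

#define kYScale 19077   /* 1.164 = 255 / 219 */
#define kVToR   26149   /* 1.596 = 255 / 112 * 0.701 */
#define kUToG    6419   /* 0.391 */
#define kVToG   13320   /* 0.813 */
#define kUToB   33050   /* 2.018 = 255 / 112 * 0.886 */
#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)

static uint8_t Clip8(int v) {
  /* In-range: drop the fractional bits. Out-of-range: (~v) >> 31 yields 0
   * for negative v and 0xff (after the uint8_t cast) for positive overflow. */
  return ((v & ~YUV_MASK2) == 0) ? (uint8_t)(v >> YUV_FIX2)
                                 : (uint8_t)((~v) >> 31);
}

int main(void) {
  const int y = 81, u = 90, v = 240;   /* roughly pure red in BT.601 */
  const int r = Clip8(kYScale * y + kVToR * v + kRCst);
  const int g = Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
  const int b = Clip8(kYScale * y + kUToB * u + kBCst);
  printf("RGB = %d %d %d\n", r, g, b);   /* prints 254 0 0 */
  return 0;
}

Note how the rounding constant YUV_HALF2 is folded into kRCst/kGCst/kBCst, which is why the SIMD paths can use a plain truncating shift.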
@@ -183,6 +183,7 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 #undef UPSAMPLE_32PIXELS
 #undef UPSAMPLE_LAST_BLOCK
 #undef CONVERT2RGB
+#undef CONVERT2RGB_32
 #undef SSE2_UPSAMPLE_FUNC
 
 #endif  // FANCY_UPSAMPLING
@@ -197,6 +198,7 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 
 void WebPInitUpsamplersSSE2(void) {
 #if defined(WEBP_USE_SSE2)
+  VP8YUVInitSSE2();
   WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
   WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
   WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
@@ -214,7 +216,7 @@ void WebPInitPremultiplySSE2(void) {
 
 #else
 
 // this empty function is to avoid an empty .o
-void WebPInitPremultiplySSE2(void);
+void WebPInitPremultiplySSE2(void) {}
 
 #endif  // FANCY_UPSAMPLING
 
diff --git a/src/dsp/yuv.c b/src/dsp/yuv.c
index 1a59f744..3e99892f 100644
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@@ -17,12 +17,8 @@ extern "C" {
 #endif
 
-#ifdef WEBP_YUV_USE_TABLE
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+#if defined(WEBP_YUV_USE_TABLE)
 
 static int done = 0;
 
@@ -30,6 +26,11 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
   return v < 0 ? 0 : v > max_value ? max_value : v;
 }
 
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
 void VP8YUVInit(void) {
   int i;
   if (done) {
@@ -70,6 +71,144 @@ void VP8YUVInit(void) {}
 
 #endif  // WEBP_YUV_USE_TABLE
 
+//-----------------------------------------------------------------------------
+// SSE2 extras
+
+#if defined(WEBP_USE_SSE2)
+
+#ifdef FANCY_UPSAMPLING
+
+#include <emmintrin.h>
+#include <string.h>   // for memcpy
+
+typedef union {   // handy struct for converting SSE2 registers
+  int32_t i32[4];
+  uint8_t u8[16];
+  __m128i m;
+} VP8kCstSSE2;
+
+static int done_sse2 = 0;
+static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
+
+void VP8YUVInitSSE2(void) {
+  if (!done_sse2) {
+    int i;
+    for (i = 0; i < 256; ++i) {
+      VP8kYtoRGBA[i].i32[0] =
+      VP8kYtoRGBA[i].i32[1] =
+      VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
+      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
+
+      VP8kUtoRGBA[i].i32[0] = 0;
+      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
+      VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128);
+      VP8kUtoRGBA[i].i32[3] = 0;
+
+      VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128);
+      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
+      VP8kVtoRGBA[i].i32[2] = 0;
+      VP8kVtoRGBA[i].i32[3] = 0;
+    }
+    done_sse2 = 1;
+  }
+}
+
+static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
+  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
+  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
+  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
+  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
+  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
+  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
+  return rgba2;
+}
+
+static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
+                                        uint8_t* const rgb) {
+  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
+  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
+  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
+  // Note: we store 8 bytes at a time, not 3 bytes!
-> memory stomp + _mm_storel_epi64((__m128i*)rgb, tmp2); +} + +static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const bgr) { + const __m128i tmp0 = VP8GetRGBA32b(y, u, v); + const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2)); + const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1); + const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2); + // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp + _mm_storel_epi64((__m128i*)bgr, tmp3); +} + +void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + for (n = 0; n < 32; n += 4) { + const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); + const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); + const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]); + const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]); + const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); + const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); + const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); + _mm_storeu_si128((__m128i*)dst, tmp2); + dst += 4 * 4; + } +} + +void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + for (n = 0; n < 32; n += 2) { + const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); + const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); + const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); + const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); + const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); + const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); + _mm_storel_epi64((__m128i*)dst, tmp3); + dst += 4 * 2; + } +} + +void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + uint8_t tmp0[2 * 3 + 5 + 15]; + uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align + for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory + VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3); + } + // Last two pixels are special: we write in a tmp buffer before sending + // to dst. + VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); + VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); + memcpy(dst + n * 3, tmp, 2 * 3); +} + +void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + int n; + uint8_t tmp0[2 * 3 + 5 + 15]; + uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align + for (n = 0; n < 30; ++n) { + VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3); + } + VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); + VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); + memcpy(dst + n * 3, tmp, 2 * 3); +} + +#else + +void VP8YUVInitSSE2(void) {} + +#endif // FANCY_UPSAMPLING + +#endif // WEBP_USE_SSE2 + #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" #endif diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h index 3844d8ca..e00a3e56 100644 --- a/src/dsp/yuv.h +++ b/src/dsp/yuv.h @@ -14,7 +14,7 @@ // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128 -// We use 16bit fixed point operations for RGB->YUV conversion. +// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX). 
// // For the Y'CbCr to RGB conversion, the BT.601 specification reads: // R = 1.164 * (Y-16) + 1.596 * (V-128) @@ -23,21 +23,24 @@ // where Y is in the [16,235] range, and U/V in the [16,240] range. // In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor // "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table. -// So in this case the formulae should be read as: +// So in this case the formulae should read: // R = 1.164 * [Y + 1.371 * (V-128) ] - 18.624 // G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624 // B = 1.164 * [Y + 1.733 * (U-128)] - 18.624 -// once factorized. Here too, 16bit fixed precision is used. +// once factorized. +// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2). +// That's the maximum possible for a convenient ARM implementation. // // Author: Skal (pascal.massimino@gmail.com) #ifndef WEBP_DSP_YUV_H_ #define WEBP_DSP_YUV_H_ +#include "./dsp.h" #include "../dec/decode_vp8.h" // Define the following to use the LUT-based code: -#define WEBP_YUV_USE_TABLE +// #define WEBP_YUV_USE_TABLE #if defined(WEBP_EXPERIMENTAL_FEATURES) // Do NOT activate this feature for real compression. This is only experimental! @@ -56,14 +59,100 @@ extern "C" { #endif -enum { YUV_FIX = 16, // fixed-point precision +enum { + YUV_FIX = 16, // fixed-point precision for RGB->YUV YUV_HALF = 1 << (YUV_FIX - 1), YUV_MASK = (256 << YUV_FIX) - 1, YUV_RANGE_MIN = -227, // min value of r/g/b output - YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output + YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output + + YUV_FIX2 = 14, // fixed-point precision for YUV->RGB + YUV_HALF2 = 1 << (YUV_FIX2 - 1), + YUV_MASK2 = (256 << YUV_FIX2) - 1 }; -#ifdef WEBP_YUV_USE_TABLE +// These constants are 14b fixed-point version of ITU-R BT.601 constants. +#define kYScale 19077 // 1.164 = 255 / 219 +#define kVToR 26149 // 1.596 = 255 / 112 * 0.701 +#define kUToG 6419 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587 +#define kVToG 13320 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587 +#define kUToB 33050 // 2.018 = 255 / 112 * 0.886 +#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2) +#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2) +#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2) + +//------------------------------------------------------------------------------ + +#ifndef WEBP_YUV_USE_TABLE + +// slower on x86 by ~7-8%, but bit-exact with the SSE2 version + +static WEBP_INLINE uint8_t VP8Clip8(int v) { + return ((v & ~YUV_MASK2) == 0) ? 
(v >> YUV_FIX2) : ((~v) >> 31); +} + +static WEBP_INLINE uint8_t VP8YUVToR(int y, int v) { + return VP8Clip8(kYScale * y + kVToR * v + kRCst); +} + +static WEBP_INLINE uint8_t VP8YUVToG(int y, int u, int v) { + return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst); +} + +static WEBP_INLINE uint8_t VP8YUVToB(int y, int u) { + return VP8Clip8(kYScale * y + kUToB * u + kBCst); +} + +static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgb) { + rgb[0] = VP8YUVToR(y, v); + rgb[1] = VP8YUVToG(y, u, v); + rgb[2] = VP8YUVToB(y, u); +} + +static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const bgr) { + bgr[0] = VP8YUVToB(y, u); + bgr[1] = VP8YUVToG(y, u, v); + bgr[2] = VP8YUVToR(y, v); +} + +static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgb) { + const int r = VP8YUVToR(y, v); // 5 usable bits + const int g = VP8YUVToG(y, u, v); // 6 usable bits + const int b = VP8YUVToB(y, u); // 5 usable bits + const int rg = (r & 0xf8) | (g >> 5); + const int gb = ((g << 3) & 0xe0) | (b >> 3); +#ifdef WEBP_SWAP_16BIT_CSP + rgb[0] = gb; + rgb[1] = rg; +#else + rgb[0] = rg; + rgb[1] = gb; +#endif +} + +static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const argb) { + const int r = VP8YUVToR(y, v); // 4 usable bits + const int g = VP8YUVToG(y, u, v); // 4 usable bits + const int b = VP8YUVToB(y, u); // 4 usable bits + const int rg = (r & 0xf0) | (g >> 4); + const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits +#ifdef WEBP_SWAP_16BIT_CSP + argb[0] = ba; + argb[1] = rg; +#else + argb[0] = rg; + argb[1] = ba; +#endif +} + +#else + +// Table-based version, not totally equivalent to the SSE2 version. +// Rounding diff is only +/-1 though. extern int16_t VP8kVToR[256], VP8kUToB[256]; extern int32_t VP8kVToG[256], VP8kUToG[256]; @@ -125,88 +214,11 @@ static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v, #endif } -#else // Table-free version (slower on x86) - -// These constants are 16b fixed-point version of ITU-R BT.601 constants -#define kYScale 76309 // 1.164 = 255 / 219 -#define kVToR 104597 // 1.596 = 255 / 112 * 0.701 -#define kUToG 25674 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587 -#define kVToG 53278 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587 -#define kUToB 132201 // 2.018 = 255 / 112 * 0.886 -#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF) -#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF) -#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF) - -static WEBP_INLINE uint8_t VP8Clip8(int v) { - return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> YUV_FIX) - : (v < 0) ? 0u : 255u; -} - -static WEBP_INLINE uint8_t VP8ClipN(int v, int N) { // clip to N bits - return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> (YUV_FIX + (8 - N))) - : (v < 0) ? 
0u : (255u >> (8 - N)); -} - -static WEBP_INLINE int VP8YUVToR(int y, int v) { - return kYScale * y + kVToR * v + kRCst; -} - -static WEBP_INLINE int VP8YUVToG(int y, int u, int v) { - return kYScale * y - kUToG * u - kVToG * v + kGCst; -} - -static WEBP_INLINE int VP8YUVToB(int y, int u) { - return kYScale * y + kUToB * u + kBCst; -} - -static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const rgb) { - rgb[0] = VP8Clip8(VP8YUVToR(y, v)); - rgb[1] = VP8Clip8(VP8YUVToG(y, u, v)); - rgb[2] = VP8Clip8(VP8YUVToB(y, u)); -} - -static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const bgr) { - bgr[0] = VP8Clip8(VP8YUVToB(y, u)); - bgr[1] = VP8Clip8(VP8YUVToG(y, u, v)); - bgr[2] = VP8Clip8(VP8YUVToR(y, v)); -} - -static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const rgb) { - const int r = VP8Clip8(VP8YUVToR(y, u)); - const int g = VP8ClipN(VP8YUVToG(y, u, v), 6); - const int b = VP8ClipN(VP8YUVToB(y, v), 5); - const uint8_t rg = (r & 0xf8) | (g >> 3); - const uint8_t gb = (g << 5) | b; -#ifdef WEBP_SWAP_16BIT_CSP - rgb[0] = gb; - rgb[1] = rg; -#else - rgb[0] = rg; - rgb[1] = gb; -#endif -} - -static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const argb) { - const int r = VP8Clip8(VP8YUVToR(y, u)); - const int g = VP8ClipN(VP8YUVToG(y, u, v), 4); - const int b = VP8Clip8(VP8YUVToB(y, v)); - const uint8_t rg = (r & 0xf0) | g; - const uint8_t ba = b | 0x0f; // overwrite the lower 4 bits -#ifdef WEBP_SWAP_16BIT_CSP - argb[0] = ba; - argb[1] = rg; -#else - argb[0] = rg; - argb[1] = ba; -#endif -} - #endif // WEBP_YUV_USE_TABLE +//----------------------------------------------------------------------------- +// Alpha handling variants + static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) { argb[0] = 0xff; @@ -228,6 +240,28 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, // Must be called before everything, to initialize the tables. void VP8YUVInit(void); +//----------------------------------------------------------------------------- +// SSE2 extra functions (mostly for upsampling_sse2.c) + +#if defined(WEBP_USE_SSE2) + +#ifdef FANCY_UPSAMPLING +// Process 32 pixels and store the result (24b or 32b per pixel) in *dst. +extern void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +extern void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +extern void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +extern void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +#endif // FANCY_UPSAMPLING + +// Must be called to initialize tables before using the functions. +extern void VP8YUVInitSSE2(void); + +#endif // WEBP_USE_SSE2 + //------------------------------------------------------------------------------ // RGB -> YUV conversion @@ -257,7 +291,7 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b) { #else // This JPEG-YUV colorspace, only for comparison! -// These are also 16-bit precision coefficients from Rec.601, but with full +// These are also 16bit precision coefficients from Rec.601, but with full // [0..255] output range. static WEBP_INLINE int VP8RGBToY(int r, int g, int b) { const int kRound = (1 << (YUV_FIX - 1));
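As a closing cross-check (the VP8RGBToY/U/V bodies are truncated above), here is a hedged round-trip sketch combining the two precisions: RGB->YUV at 16-bit precision (YUV_FIX), then back to RGB with the new 14-bit path (YUV_FIX2). The RGB->YUV coefficients are reconstructed from the 0.2569/0.5044/0.0979 weights quoted at the top of yuv.h, so treat them as assumptions rather than the patch's exact values.

#include <stdint.h>
#include <stdio.h>

enum { YUV_FIX = 16, YUV_HALF = 1 << (YUV_FIX - 1),
       YUV_FIX2 = 14, YUV_HALF2 = 1 << (YUV_FIX2 - 1),
       YUV_MASK2 = (256 << YUV_FIX2) - 1 };

/* RGB->YUV, 16-bit fixed point: Y = 0.2569*R + 0.5044*G + 0.0979*B + 16, etc. */
static int RGBToY(int r, int g, int b) {
  return (16839 * r + 33059 * g + 6420 * b + YUV_HALF + (16 << YUV_FIX)) >> YUV_FIX;
}
static int RGBToU(int r, int g, int b) {
  return (-9719 * r - 19081 * g + 28800 * b + YUV_HALF + (128 << YUV_FIX)) >> YUV_FIX;
}
static int RGBToV(int r, int g, int b) {
  return (28800 * r - 24116 * g - 4684 * b + YUV_HALF + (128 << YUV_FIX)) >> YUV_FIX;
}

static uint8_t Clip8(int v) {   /* same branchless clip as VP8Clip8 above */
  return ((v & ~YUV_MASK2) == 0) ? (uint8_t)(v >> YUV_FIX2)
                                 : (uint8_t)((~v) >> 31);
}

int main(void) {
  const int r0 = 200, g0 = 120, b0 = 40;
  const int y = RGBToY(r0, g0, b0);   /* 132 */
  const int u = RGBToU(r0, g0, b0);   /*  81 */
  const int v = RGBToV(r0, g0, b0);   /* 169 */
  /* YUV->RGB, 14-bit fixed point, with the kYScale/kVToR/... constants. */
  const int r = Clip8(19077 * y + 26149 * v - 19077 * 16 - 26149 * 128 + YUV_HALF2);
  const int g = Clip8(19077 * y - 6419 * u - 13320 * v
                      - 19077 * 16 + (6419 + 13320) * 128 + YUV_HALF2);
  const int b = Clip8(19077 * y + 33050 * u - 19077 * 16 - 33050 * 128 + YUV_HALF2);
  printf("(%d,%d,%d) -> YUV(%d,%d,%d) -> (%d,%d,%d)\n",
         r0, g0, b0, y, u, v, r, g, b);   /* (200,120,40) -> (201,120,40) */
  return 0;
}

The one-LSB drift on the red channel illustrates the +/-1 rounding difference mentioned in the "Table-based version" comment above.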