dsp/lossless: split enc/dec functions

adds lossless_enc*.c; reduces the size of the decode-only so: ~78K w/gcc-4.8.2 on x86_64. Change-Id: If5e4610b67d05eba5896bc64bab79e9df92b2092
2015-03-20 19:09:49 -07:00 · 2015-03-20 19:09:49 -07:00 · 553051f741
commit 553051f741
parent 9064adc8a8
15 changed files with 1884 additions and 1597 deletions
--- a/Android.mk
+++ b/Android.mk
@ -53,7 +53,6 @@ dsp_dec_srcs := \
    src/dsp/filters_mips_dsp_r2.c \
    src/dsp/filters_sse2.c \
    src/dsp/lossless.c \
-    src/dsp/lossless_mips32.c \
    src/dsp/lossless_mips_dsp_r2.c \
    src/dsp/lossless_neon.$(NEON) \
    src/dsp/lossless_sse2.c \
@ -80,6 +79,11 @@ dsp_enc_srcs := \
    src/dsp/enc_mips_dsp_r2.c \
    src/dsp/enc_neon.$(NEON) \
    src/dsp/enc_sse2.c \
+    src/dsp/lossless_enc.c \
+    src/dsp/lossless_enc_mips32.c \
+    src/dsp/lossless_enc_mips_dsp_r2.c \
+    src/dsp/lossless_enc_neon.$(NEON) \
+    src/dsp/lossless_enc_sse2.c \

 enc_srcs := \
    src/enc/alpha.c \
--- a/Makefile.vc
+++ b/Makefile.vc
@ -200,7 +200,6 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\filters_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\filters_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
-    $(DIROBJ)\dsp\lossless_mips32.obj \
    $(DIROBJ)\dsp\lossless_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
@ -230,6 +229,11 @@ DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\enc_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\enc_neon.obj \
    $(DIROBJ)\dsp\enc_sse2.obj \
+    $(DIROBJ)\dsp\lossless_enc.obj \
+    $(DIROBJ)\dsp\lossless_enc_mips32.obj \
+    $(DIROBJ)\dsp\lossless_enc_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\lossless_enc_neon.obj \
+    $(DIROBJ)\dsp\lossless_enc_sse2.obj \

 EX_FORMAT_DEC_OBJS = \
    $(DIROBJ)\examples\jpegdec.obj \
--- a/makefile.unix
+++ b/makefile.unix
@ -130,7 +130,6 @@ DSP_DEC_OBJS = \
    src/dsp/filters_mips_dsp_r2.o \
    src/dsp/filters_sse2.o \
    src/dsp/lossless.o \
-    src/dsp/lossless_mips32.o \
    src/dsp/lossless_mips_dsp_r2.o \
    src/dsp/lossless_neon.o \
    src/dsp/lossless_sse2.o \
@ -160,6 +159,11 @@ DSP_ENC_OBJS = \
    src/dsp/enc_mips_dsp_r2.o \
    src/dsp/enc_neon.o \
    src/dsp/enc_sse2.o \
+    src/dsp/lossless_enc.o \
+    src/dsp/lossless_enc_mips32.o \
+    src/dsp/lossless_enc_mips_dsp_r2.o \
+    src/dsp/lossless_enc_neon.o \
+    src/dsp/lossless_enc_sse2.o \

 ENC_OBJS = \
    src/enc/alpha.o \
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -23,7 +23,6 @@ COMMON_SOURCES += filters.c
 COMMON_SOURCES += filters_mips_dsp_r2.c
 COMMON_SOURCES += lossless.c
 COMMON_SOURCES += lossless.h
-COMMON_SOURCES += lossless_mips32.c
 COMMON_SOURCES += lossless_mips_dsp_r2.c
 COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += mips_macro.h
@ -49,6 +48,10 @@ ENC_SOURCES += enc.c
 ENC_SOURCES += enc_mips32.c
 ENC_SOURCES += enc_mips_dsp_r2.c
 ENC_SOURCES += enc_neon.c
+ENC_SOURCES += lossless_enc.c
+ENC_SOURCES += lossless_enc_mips32.c
+ENC_SOURCES += lossless_enc_mips_dsp_r2.c
+ENC_SOURCES += lossless_enc_neon.c

 libwebpdsp_avx2_la_SOURCES =
 libwebpdsp_avx2_la_SOURCES += enc_avx2.c
@ -75,6 +78,7 @@ libwebpdsp_sse2_la_SOURCES =
 libwebpdsp_sse2_la_SOURCES += argb_sse2.c
 libwebpdsp_sse2_la_SOURCES += cost_sse2.c
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
+libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
 libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
 libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -89,6 +89,7 @@ void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);

 // Must be called before calling any of the above methods.
 void VP8LDspInit(void);
+void VP8LEncDspInit(void);

 //------------------------------------------------------------------------------
 // Image transforms.
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
--- a/src/dsp/lossless_enc_mips32.c
+++ b/src/dsp/lossless_enc_mips32.c
@ -1,4 +1,4 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
+// Copyright 2015 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
@ -399,9 +399,9 @@ static void HistogramAdd(const VP8LHistogram* const a,
 //------------------------------------------------------------------------------
 // Entry point

-extern void VP8LDspInitMIPS32(void);
+extern void VP8LEncDspInitMIPS32(void);

-WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPS32(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
@ -413,7 +413,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPS32(void) {

 #else  // !WEBP_USE_MIPS32

-extern void VP8LDspInitMIPS32(void);
-WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPS32(void) {}
+extern void VP8LEncDspInitMIPS32(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {}

 #endif  // WEBP_USE_MIPS32
--- a/src/dsp/lossless_enc_mips_dsp_r2.c
+++ b/src/dsp/lossless_enc_mips_dsp_r2.c
@ -0,0 +1,276 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./lossless.h"
+// In place: per 32-bit pixel, subtract bits 8..15 from bytes 0 and 2 (mod 256).
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
+                                        int num_pixels) {
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);   // end of 4-pixel loop
+  uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);  // end of 1-pixel loop
+  __asm__ volatile (
+    ".set       push                                          \n\t"
+    ".set       noreorder                                     \n\t"  // delay slots are filled by hand
+    "beq        %[argb_data],    %[p_loop1_end],     3f       \n\t"  // fewer than 4 pixels
+    " nop                                                     \n\t"
+  "0:                                                         \n\t"  // 4 pixels per iteration
+    "lw         %[temp0],        0(%[argb_data])              \n\t"
+    "lw         %[temp1],        4(%[argb_data])              \n\t"
+    "lw         %[temp2],        8(%[argb_data])              \n\t"
+    "lw         %[temp3],        12(%[argb_data])             \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"  // bits 8..15 of pixel
+    "ext        %[temp5],        %[temp1],           8,    8  \n\t"
+    "ext        %[temp6],        %[temp2],           8,    8  \n\t"
+    "ext        %[temp7],        %[temp3],           8,    8  \n\t"
+    "addiu      %[argb_data],    %[argb_data],       16       \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"  // copy into both halfwords
+    "replv.ph   %[temp5],        %[temp5]                     \n\t"
+    "replv.ph   %[temp6],        %[temp6]                     \n\t"
+    "replv.ph   %[temp7],        %[temp7]                     \n\t"
+    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"  // bytewise subtract
+    "subu.qb    %[temp1],        %[temp1],           %[temp5] \n\t"
+    "subu.qb    %[temp2],        %[temp2],           %[temp6] \n\t"
+    "subu.qb    %[temp3],        %[temp3],           %[temp7] \n\t"
+    "sw         %[temp0],        -16(%[argb_data])            \n\t"  // pointer already advanced
+    "sw         %[temp1],        -12(%[argb_data])            \n\t"
+    "sw         %[temp2],        -8(%[argb_data])             \n\t"
+    "bne        %[argb_data],    %[p_loop1_end],     0b       \n\t"
+    " sw        %[temp3],        -4(%[argb_data])             \n\t"  // in the branch delay slot
+  "3:                                                         \n\t"
+    "beq        %[argb_data],    %[p_loop2_end],     2f       \n\t"  // no leftover pixels
+    " nop                                                     \n\t"
+  "1:                                                         \n\t"  // 1 pixel per iteration
+    "lw         %[temp0],        0(%[argb_data])              \n\t"
+    "addiu      %[argb_data],    %[argb_data],       4        \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"
+    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
+    "bne        %[argb_data],    %[p_loop2_end],     1b       \n\t"
+    " sw        %[temp0],        -4(%[argb_data])             \n\t"  // in the branch delay slot
+  "2:                                                         \n\t"
+    ".set       pop                                           \n\t"
+    : [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+// Multiplies two signed 8-bit values; returns the product as uint32_t, shifted right by 5.
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+                                                int8_t color) {
+  return (uint32_t)((int)(color_pred) * color) >> 5;  // cast applies before >>: logical shift
+}
+// Applies the multipliers in 'm' to 'num_pixels' pixels of 'data', in place.
+static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
+                           int num_pixels) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  uint32_t argb, argb1, new_red, new_red1;
+  const uint32_t G_to_R = m->green_to_red_;
+  const uint32_t G_to_B = m->green_to_blue_;
+  const uint32_t R_to_B = m->red_to_blue_;
+  uint32_t* const p_loop_end = data + (num_pixels & ~1);  // asm loop covers whole pairs only
+  __asm__ volatile (
+    ".set            push                                    \n\t"
+    ".set            noreorder                               \n\t"
+    "beq             %[data],      %[p_loop_end],  1f        \n\t"  // fewer than 2 pixels
+    " nop                                                    \n\t"
+    "replv.ph        %[temp0],     %[G_to_R]                 \n\t"  // multiplier in both halfwords
+    "replv.ph        %[temp1],     %[G_to_B]                 \n\t"
+    "replv.ph        %[temp2],     %[R_to_B]                 \n\t"
+    "shll.ph         %[temp0],     %[temp0],       8         \n\t"  // shll 8 + shra 8:
+    "shll.ph         %[temp1],     %[temp1],       8         \n\t"  // sign-extend the low byte
+    "shll.ph         %[temp2],     %[temp2],       8         \n\t"  // of each halfword
+    "shra.ph         %[temp0],     %[temp0],       8         \n\t"
+    "shra.ph         %[temp1],     %[temp1],       8         \n\t"
+    "shra.ph         %[temp2],     %[temp2],       8         \n\t"
+  "0:                                                        \n\t"  // 2 pixels per iteration
+    "lw              %[argb],      0(%[data])                \n\t"
+    "lw              %[argb1],     4(%[data])                \n\t"
+    "lhu             %[new_red],   2(%[data])                \n\t"  // halfword at byte offset 2
+    "lhu             %[new_red1],  6(%[data])                \n\t"  // halfword at byte offset 6
+    "precrq.qb.ph    %[temp3],     %[argb],        %[argb1]  \n\t"
+    "precr.qb.ph     %[temp4],     %[argb],        %[argb1]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[temp3]                  \n\t"
+    "preceu.ph.qbla  %[temp4],     %[temp4]                  \n\t"
+    "shll.ph         %[temp3],     %[temp3],       8         \n\t"
+    "shll.ph         %[temp4],     %[temp4],       8         \n\t"
+    "shra.ph         %[temp3],     %[temp3],       8         \n\t"
+    "shra.ph         %[temp4],     %[temp4],       8         \n\t"
+    "mul.ph          %[temp5],     %[temp3],       %[temp0]  \n\t"  // * G_to_R
+    "mul.ph          %[temp3],     %[temp3],       %[temp1]  \n\t"  // * G_to_B
+    "mul.ph          %[temp4],     %[temp4],       %[temp2]  \n\t"  // * R_to_B
+    "addiu           %[data],      %[data],        8         \n\t"
+    "ins             %[new_red1],  %[new_red],     16,   16  \n\t"
+    "ins             %[argb1],     %[argb],        16,   16  \n\t"
+    "shra.ph         %[temp5],     %[temp5],       5         \n\t"  // products >> 5
+    "shra.ph         %[temp3],     %[temp3],       5         \n\t"
+    "shra.ph         %[temp4],     %[temp4],       5         \n\t"
+    "subu.ph         %[new_red1],  %[new_red1],    %[temp5]  \n\t"
+    "subu.ph         %[argb1],     %[argb1],       %[temp3]  \n\t"
+    "preceu.ph.qbra  %[temp5],     %[new_red1]               \n\t"
+    "subu.ph         %[argb1],     %[argb1],       %[temp4]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[argb1]                  \n\t"
+    "sb              %[temp5],     -2(%[data])               \n\t"  // 2nd pixel, byte offset 2
+    "sb              %[temp3],     -4(%[data])               \n\t"  // 2nd pixel, byte offset 0
+    "sra             %[temp5],     %[temp5],       16        \n\t"
+    "sra             %[temp3],     %[temp3],       16        \n\t"
+    "sb              %[temp5],     -6(%[data])               \n\t"  // 1st pixel, byte offset 2
+    "bne             %[data],      %[p_loop_end],  0b        \n\t"
+    " sb             %[temp3],     -8(%[data])               \n\t"  // delay slot: 1st pixel, byte 0
+  "1:                                                        \n\t"
+    ".set            pop                                     \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
+      [argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
+    : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+      [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+    : "memory", "hi", "lo"
+  );
+  // Odd pixel count: 'data' now points at the last pixel; handle it in plain C.
+  if (num_pixels & 1) {
+    const uint32_t argb_ = data[0];
+    const uint32_t green = argb_ >> 8;   // upper bits dropped by the int8_t parameter
+    const uint32_t red = argb_ >> 16;    // upper bits dropped by int8_t / the 0xff masks
+    uint32_t new_blue = argb_;
+    new_red = red;
+    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue &= 0xff;
+    data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);  // bits 8..15, 24..31 kept
+  }
+}
+// Returns (low byte of argb) minus the green- and red-based deltas, modulo 256.
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+                                              uint8_t red_to_blue,
+                                              uint32_t argb) {
+  const uint32_t green = argb >> 8;   // narrowed to int8_t by ColorTransformDelta()
+  const uint32_t red = argb >> 16;    // narrowed to int8_t by ColorTransformDelta()
+  uint8_t new_blue = argb;            // keeps bits 0..7 only
+  new_blue -= ColorTransformDelta(green_to_blue, green);
+  new_blue -= ColorTransformDelta(red_to_blue, red);
+  return (new_blue & 0xff);
+}
+// For each pixel of a tile_width x tile_height tile, increments one histo[] bin.
+static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
+                                       int tile_width, int tile_height,
+                                       int green_to_blue, int red_to_blue,
+                                       int histo[]) {
+  const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);      // multiplier in both halfwords
+  const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);  // multiplier in both halfwords
+  const uint32_t mask = 0xff00ffu;
+  while (tile_height-- > 0) {
+    int x;
+    const uint32_t* p_argb = argb;
+    argb += stride;  // next row starts 'stride' pixels further
+    for (x = 0; x < (tile_width >> 1); ++x) {  // two pixels per iteration
+      int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+      __asm__ volatile (
+        "lw           %[temp0],  0(%[p_argb])             \n\t"
+        "lw           %[temp1],  4(%[p_argb])             \n\t"
+        "precr.qb.ph  %[temp2],  %[temp0],  %[temp1]      \n\t"
+        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
+        "shra.ph      %[temp2],  %[temp2],  8             \n\t"
+        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
+        "mul.ph       %[temp5],  %[temp2],  %[rtb]        \n\t"
+        "mul.ph       %[temp6],  %[temp3],  %[gtb]        \n\t"
+        "and          %[temp4],  %[temp1],  %[mask]       \n\t"
+        "addiu        %[p_argb], %[p_argb], 8             \n\t"
+        "shra.ph      %[temp5],  %[temp5],  5             \n\t"
+        "shra.ph      %[temp6],  %[temp6],  5             \n\t"
+        "subu.qb      %[temp2],  %[temp4],  %[temp5]      \n\t"
+        "subu.qb      %[temp2],  %[temp2],  %[temp6]      \n\t"
+        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+          [temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
+        : [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
+        : "memory", "hi", "lo"
+      );
+      ++histo[(uint8_t)(temp2 >> 16)];  // bin = bits 16..23 of temp2
+      ++histo[(uint8_t)temp2];          // bin = bits 0..7 of temp2
+    }
+    if (tile_width & 1) {  // odd width: p_argb is now at the row's last pixel
+      ++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
+    }
+  }
+}
+// Returns (argb >> 16) minus the green-based delta, reduced to its low 8 bits.
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+                                             uint32_t argb) {
+  const uint32_t green = argb >> 8;  // narrowed to int8_t by ColorTransformDelta()
+  uint32_t new_red = argb >> 16;     // bits 24..31 still present; masked off below
+  new_red -= ColorTransformDelta(green_to_red, green);
+  return (new_red & 0xff);
+}
+// For each pixel of a tile_width x tile_height tile, increments one histo[] bin.
+static void CollectColorRedTransforms(const uint32_t* argb, int stride,
+                                      int tile_width, int tile_height,
+                                      int green_to_red, int histo[]) {
+  const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);  // multiplier in both halfwords
+  while (tile_height-- > 0) {
+    int x;
+    const uint32_t* p_argb = argb;
+    argb += stride;  // next row starts 'stride' pixels further
+    for (x = 0; x < (tile_width >> 1); ++x) {  // two pixels per iteration
+      int temp0, temp1, temp2, temp3, temp4;
+      __asm__ volatile (
+        "lw           %[temp0],  0(%[p_argb])             \n\t"
+        "lw           %[temp1],  4(%[p_argb])             \n\t"
+        "precrq.ph.w  %[temp4],  %[temp0],  %[temp1]      \n\t"
+        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
+        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
+        "mul.ph       %[temp2],  %[temp3],  %[gtr]        \n\t"
+        "addiu        %[p_argb], %[p_argb], 8             \n\t"
+        "shra.ph      %[temp2],  %[temp2],  5             \n\t"
+        "subu.qb      %[temp2],  %[temp4],  %[temp2]      \n\t"
+        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+        : [gtr]"r"(gtr)
+        : "memory", "hi", "lo"
+      );
+      ++histo[(uint8_t)(temp2 >> 16)];  // bin = bits 16..23 of temp2
+      ++histo[(uint8_t)temp2];          // bin = bits 0..7 of temp2
+    }
+    if (tile_width & 1) {  // odd width: p_argb is now at the row's last pixel
+      ++histo[TransformColorRed(green_to_red, *p_argb)];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+// Points four VP8L* hooks (declared elsewhere, presumably lossless.h) at this file's versions.
+extern void VP8LEncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LTransformColor = TransformColor;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+// Empty definition so the symbol still exists when WEBP_USE_MIPS_DSP_R2 is not defined.
+extern void VP8LEncDspInitMIPSdspR2(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {}
+
+#endif  // WEBP_USE_MIPS_DSP_R2
--- a/src/dsp/lossless_enc_neon.c
+++ b/src/dsp/lossless_enc_neon.c
@ -0,0 +1,93 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "./lossless.h"
+#include "./neon.h"
+
+#ifdef WEBP_USE_INTRINSICS
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+// non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+#define USE_VTBLQ
+#endif
+// Shuffle table + helper: copy each pixel's byte 1 into bytes 0 and 2, zero bytes 1 and 3.
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x16_t shuffle) {
+  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),    // indices 1, 5: pixels 0-1
+                     vtbl1q_u8(argb, vget_high_u8(shuffle)));  // indices 9, 13: pixels 2-3
+}
+#else  // !USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x8_t shuffle) {
+  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),    // same table on the low 8 bytes
+                     vtbl1_u8(vget_high_u8(argb), shuffle));  // ... and on the high 8 bytes
+}
+#endif  // USE_VTBLQ
+// In place: bytewise-subtracts the DoGreenShuffle() result from each group of 4 pixels.
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const uint32_t* const end = argb_data + (num_pixels & ~3);  // end of the whole groups of 4
+#ifdef USE_VTBLQ
+  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+  for (; argb_data < end; argb_data += 4) {  // 4 pixels = 16 bytes per iteration
+    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));  // per-byte subtract, stored back
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);  // argb_data == end here
+}
+
+#undef USE_VTBLQ
+
+#endif   // WEBP_USE_INTRINSICS
+
+//------------------------------------------------------------------------------
+// Entry point
+// Installs the NEON SubtractGreenFromBlueAndRed, only when WEBP_USE_INTRINSICS is defined.
+extern void VP8LEncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
+#ifdef WEBP_USE_INTRINSICS
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+#endif
+}
+
+#else  // !WEBP_USE_NEON
+// Empty definition so the symbol still exists when WEBP_USE_NEON is not defined.
+extern void VP8LEncDspInitNEON(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {}
+
+#endif  // WEBP_USE_NEON
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@ -0,0 +1,194 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <emmintrin.h>
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+// In place: per pixel, subtracts bits 8..15 from bytes 0 and 2 (mod 256), 4 pixels at a time.
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+  const __m128i mask = _mm_set1_epi32(0x0000ff00);
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // whole groups of 4 pixels
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
+    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
+    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
+    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+    const __m128i out = _mm_sub_epi8(in, in_0g0g);       // per-byte subtract
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);  // remaining 0..3 pixels
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+// Per 32-bit lane: signed 8-bit product of color_pred and color, then a logical >> 5.
+static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
+                                               __m128i color) {
+  // We simulate signed 8-bit multiplication as:
+  // * Left shift the two (8-bit) numbers by 8 bits,
+  // * Perform a 16-bit signed multiplication and retain the higher 16-bits.
+  const __m128i color_pred_shifted = _mm_slli_epi32(color_pred, 8);
+  const __m128i color_shifted = _mm_slli_epi32(color, 8);
+  // Note: This performs multiplication on 8 packed 16-bit numbers, 4 of which
+  // happen to be zeroes.
+  const __m128i signed_mult =
+      _mm_mulhi_epi16(color_pred_shifted, color_shifted);
+  return _mm_srli_epi32(signed_mult, 5);  // TransformColor() masks the final value to 8 bits
+}
+// Applies the multipliers in 'm' to 'num_pixels' pixels of 'argb_data', in place.
+static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
+                                       uint32_t* argb_data,
+                                       int num_pixels) {
+  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
+  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
+  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
+
+  int i;
+
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // whole groups of 4 pixels
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
+    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
+    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
+    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
+    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
+    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
+    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
+    const __m128i b = in;  // whole pixel; only its low byte survives the mask below
+
+    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
+    const __m128i r_new =
+        _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
+    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
+
+    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
+    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
+    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
+    const __m128i b_new =
+        _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);
+
+    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);  // alpha/green kept, red/blue replaced
+  }
+
+  // Fall-back to C-version for left-overs.
+  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+
+#define LINE_SIZE 16    // 8 or 16: uint32_t elements handled per loop iteration
+static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,  // out[i] = a[i] + b[i]
+                      int size) {  // element count; must be a multiple of LINE_SIZE
+  int i;
+  assert(size % LINE_SIZE == 0);
+  for (i = 0; i < size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);  // 4 elements per load
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i +  0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));  // all loads precede stores
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+}
+// Accumulates in place: out[i] += a[i] for i in [0, size); size must be a LINE_SIZE multiple.
+static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+  int i;
+  assert(size % LINE_SIZE == 0);
+  for (i = 0; i < size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);  // 4 elements per load
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);  // current values of out
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+}
+#undef LINE_SIZE
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
+static void HistogramAdd(const VP8LHistogram* const a,  // out = a + b, array by array
+                         const VP8LHistogram* const b,  // may be the same object as 'out'
+                         VP8LHistogram* const out) {
+  int i;
+  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+  if (b != out) {  // three distinct operands
+    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+  } else {  // b aliases out: accumulate 'a' into 'out'
+    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
+    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+  }
+  for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {  // rest of literal_, scalar
+    out->literal_[i] = a->literal_[i] + b->literal_[i];
+  }
+  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {  // distance_ is always summed in scalar code
+    out->distance_[i] = a->distance_[i] + b->distance_[i];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+// Points three VP8L* hooks (declared elsewhere, presumably lossless.h) at this file's versions.
+extern void VP8LEncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LTransformColor = TransformColor;
+  VP8LHistogramAdd = HistogramAdd;
+}
+
+#else  // !WEBP_USE_SSE2
+// Empty definition so the symbol still exists when WEBP_USE_SSE2 is not defined.
+extern void VP8LEncDspInitSSE2(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {}
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/lossless_mips_dsp_r2.c
+++ b/src/dsp/lossless_mips_dsp_r2.c
@ -145,61 +145,6 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
  return temp1;
 }

-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
-                                        int num_pixels) {
-  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
-  uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
-  __asm__ volatile (
-    ".set       push                                          \n\t"
-    ".set       noreorder                                     \n\t"
-    "beq        %[argb_data],    %[p_loop1_end],     3f       \n\t"
-    " nop                                                     \n\t"
-  "0:                                                         \n\t"
-    "lw         %[temp0],        0(%[argb_data])              \n\t"
-    "lw         %[temp1],        4(%[argb_data])              \n\t"
-    "lw         %[temp2],        8(%[argb_data])              \n\t"
-    "lw         %[temp3],        12(%[argb_data])             \n\t"
-    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
-    "ext        %[temp5],        %[temp1],           8,    8  \n\t"
-    "ext        %[temp6],        %[temp2],           8,    8  \n\t"
-    "ext        %[temp7],        %[temp3],           8,    8  \n\t"
-    "addiu      %[argb_data],    %[argb_data],       16       \n\t"
-    "replv.ph   %[temp4],        %[temp4]                     \n\t"
-    "replv.ph   %[temp5],        %[temp5]                     \n\t"
-    "replv.ph   %[temp6],        %[temp6]                     \n\t"
-    "replv.ph   %[temp7],        %[temp7]                     \n\t"
-    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
-    "subu.qb    %[temp1],        %[temp1],           %[temp5] \n\t"
-    "subu.qb    %[temp2],        %[temp2],           %[temp6] \n\t"
-    "subu.qb    %[temp3],        %[temp3],           %[temp7] \n\t"
-    "sw         %[temp0],        -16(%[argb_data])            \n\t"
-    "sw         %[temp1],        -12(%[argb_data])            \n\t"
-    "sw         %[temp2],        -8(%[argb_data])             \n\t"
-    "bne        %[argb_data],    %[p_loop1_end],     0b       \n\t"
-    " sw        %[temp3],        -4(%[argb_data])             \n\t"
-  "3:                                                         \n\t"
-    "beq        %[argb_data],    %[p_loop2_end],     2f       \n\t"
-    " nop                                                     \n\t"
-  "1:                                                         \n\t"
-    "lw         %[temp0],        0(%[argb_data])              \n\t"
-    "addiu      %[argb_data],    %[argb_data],       4        \n\t"
-    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
-    "replv.ph   %[temp4],        %[temp4]                     \n\t"
-    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
-    "bne        %[argb_data],    %[p_loop2_end],     1b       \n\t"
-    " sw        %[temp0],        -4(%[argb_data])             \n\t"
-  "2:                                                         \n\t"
-    ".set       pop                                           \n\t"
-    : [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
-      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
-      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
-      [temp7]"=&r"(temp7)
-    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
-    : "memory"
-  );
-}
-
 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  __asm__ volatile (
@ -280,189 +225,6 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractHalf(left, top[0], top[-1]);
 }

-static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
-                                                int8_t color) {
-  return (uint32_t)((int)(color_pred) * color) >> 5;
-}
-
-static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
-                           int num_pixels) {
-  int temp0, temp1, temp2, temp3, temp4, temp5;
-  uint32_t argb, argb1, new_red, new_red1;
-  const uint32_t G_to_R = m->green_to_red_;
-  const uint32_t G_to_B = m->green_to_blue_;
-  const uint32_t R_to_B = m->red_to_blue_;
-  uint32_t* const p_loop_end = data + (num_pixels & ~1);
-  __asm__ volatile (
-    ".set            push                                    \n\t"
-    ".set            noreorder                               \n\t"
-    "beq             %[data],      %[p_loop_end],  1f        \n\t"
-    " nop                                                    \n\t"
-    "replv.ph        %[temp0],     %[G_to_R]                 \n\t"
-    "replv.ph        %[temp1],     %[G_to_B]                 \n\t"
-    "replv.ph        %[temp2],     %[R_to_B]                 \n\t"
-    "shll.ph         %[temp0],     %[temp0],       8         \n\t"
-    "shll.ph         %[temp1],     %[temp1],       8         \n\t"
-    "shll.ph         %[temp2],     %[temp2],       8         \n\t"
-    "shra.ph         %[temp0],     %[temp0],       8         \n\t"
-    "shra.ph         %[temp1],     %[temp1],       8         \n\t"
-    "shra.ph         %[temp2],     %[temp2],       8         \n\t"
-  "0:                                                        \n\t"
-    "lw              %[argb],      0(%[data])                \n\t"
-    "lw              %[argb1],     4(%[data])                \n\t"
-    "lhu             %[new_red],   2(%[data])                \n\t"
-    "lhu             %[new_red1],  6(%[data])                \n\t"
-    "precrq.qb.ph    %[temp3],     %[argb],        %[argb1]  \n\t"
-    "precr.qb.ph     %[temp4],     %[argb],        %[argb1]  \n\t"
-    "preceu.ph.qbra  %[temp3],     %[temp3]                  \n\t"
-    "preceu.ph.qbla  %[temp4],     %[temp4]                  \n\t"
-    "shll.ph         %[temp3],     %[temp3],       8         \n\t"
-    "shll.ph         %[temp4],     %[temp4],       8         \n\t"
-    "shra.ph         %[temp3],     %[temp3],       8         \n\t"
-    "shra.ph         %[temp4],     %[temp4],       8         \n\t"
-    "mul.ph          %[temp5],     %[temp3],       %[temp0]  \n\t"
-    "mul.ph          %[temp3],     %[temp3],       %[temp1]  \n\t"
-    "mul.ph          %[temp4],     %[temp4],       %[temp2]  \n\t"
-    "addiu           %[data],      %[data],        8         \n\t"
-    "ins             %[new_red1],  %[new_red],     16,   16  \n\t"
-    "ins             %[argb1],     %[argb],        16,   16  \n\t"
-    "shra.ph         %[temp5],     %[temp5],       5         \n\t"
-    "shra.ph         %[temp3],     %[temp3],       5         \n\t"
-    "shra.ph         %[temp4],     %[temp4],       5         \n\t"
-    "subu.ph         %[new_red1],  %[new_red1],    %[temp5]  \n\t"
-    "subu.ph         %[argb1],     %[argb1],       %[temp3]  \n\t"
-    "preceu.ph.qbra  %[temp5],     %[new_red1]               \n\t"
-    "subu.ph         %[argb1],     %[argb1],       %[temp4]  \n\t"
-    "preceu.ph.qbra  %[temp3],     %[argb1]                  \n\t"
-    "sb              %[temp5],     -2(%[data])               \n\t"
-    "sb              %[temp3],     -4(%[data])               \n\t"
-    "sra             %[temp5],     %[temp5],       16        \n\t"
-    "sra             %[temp3],     %[temp3],       16        \n\t"
-    "sb              %[temp5],     -6(%[data])               \n\t"
-    "bne             %[data],      %[p_loop_end],  0b        \n\t"
-    " sb             %[temp3],     -8(%[data])               \n\t"
-  "1:                                                        \n\t"
-    ".set            pop                                     \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
-      [new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
-      [argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
-    : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
-      [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
-    : "memory", "hi", "lo"
-  );
-
-  if (num_pixels & 1) {
-    const uint32_t argb_ = data[0];
-    const uint32_t green = argb_ >> 8;
-    const uint32_t red = argb_ >> 16;
-    uint32_t new_blue = argb_;
-    new_red = red;
-    new_red -= ColorTransformDelta(m->green_to_red_, green);
-    new_red &= 0xff;
-    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
-    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
-    new_blue &= 0xff;
-    data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
-  }
-}
-
-static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
-                                              uint8_t red_to_blue,
-                                              uint32_t argb) {
-  const uint32_t green = argb >> 8;
-  const uint32_t red = argb >> 16;
-  uint8_t new_blue = argb;
-  new_blue -= ColorTransformDelta(green_to_blue, green);
-  new_blue -= ColorTransformDelta(red_to_blue, red);
-  return (new_blue & 0xff);
-}
-
-static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
-                                       int tile_width, int tile_height,
-                                       int green_to_blue, int red_to_blue,
-                                       int histo[]) {
-  const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
-  const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
-  const uint32_t mask = 0xff00ffu;
-  while (tile_height-- > 0) {
-    int x;
-    const uint32_t* p_argb = argb;
-    argb += stride;
-    for (x = 0; x < (tile_width >> 1); ++x) {
-      int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
-      __asm__ volatile (
-        "lw           %[temp0],  0(%[p_argb])             \n\t"
-        "lw           %[temp1],  4(%[p_argb])             \n\t"
-        "precr.qb.ph  %[temp2],  %[temp0],  %[temp1]      \n\t"
-        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
-        "shra.ph      %[temp2],  %[temp2],  8             \n\t"
-        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
-        "mul.ph       %[temp5],  %[temp2],  %[rtb]        \n\t"
-        "mul.ph       %[temp6],  %[temp3],  %[gtb]        \n\t"
-        "and          %[temp4],  %[temp1],  %[mask]       \n\t"
-        "addiu        %[p_argb], %[p_argb], 8             \n\t"
-        "shra.ph      %[temp5],  %[temp5],  5             \n\t"
-        "shra.ph      %[temp6],  %[temp6],  5             \n\t"
-        "subu.qb      %[temp2],  %[temp4],  %[temp5]      \n\t"
-        "subu.qb      %[temp2],  %[temp2],  %[temp6]      \n\t"
-        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
-          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
-          [temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
-        : [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
-        : "memory", "hi", "lo"
-      );
-      ++histo[(uint8_t)(temp2 >> 16)];
-      ++histo[(uint8_t)temp2];
-    }
-    if (tile_width & 1) {
-      ++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
-    }
-  }
-}
-
-static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
-                                             uint32_t argb) {
-  const uint32_t green = argb >> 8;
-  uint32_t new_red = argb >> 16;
-  new_red -= ColorTransformDelta(green_to_red, green);
-  return (new_red & 0xff);
-}
-
-static void CollectColorRedTransforms(const uint32_t* argb, int stride,
-                                      int tile_width, int tile_height,
-                                      int green_to_red, int histo[]) {
-  const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
-  while (tile_height-- > 0) {
-    int x;
-    const uint32_t* p_argb = argb;
-    argb += stride;
-    for (x = 0; x < (tile_width >> 1); ++x) {
-      int temp0, temp1, temp2, temp3, temp4;
-      __asm__ volatile (
-        "lw           %[temp0],  0(%[p_argb])             \n\t"
-        "lw           %[temp1],  4(%[p_argb])             \n\t"
-        "precrq.ph.w  %[temp4],  %[temp0],  %[temp1]      \n\t"
-        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
-        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
-        "mul.ph       %[temp2],  %[temp3],  %[gtr]        \n\t"
-        "addiu        %[p_argb], %[p_argb], 8             \n\t"
-        "shra.ph      %[temp2],  %[temp2],  5             \n\t"
-        "subu.qb      %[temp2],  %[temp4],  %[temp2]      \n\t"
-        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
-          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
-        : [gtr]"r"(gtr)
-        : "memory", "hi", "lo"
-      );
-      ++histo[(uint8_t)(temp2 >> 16)];
-      ++histo[(uint8_t)temp2];
-    }
-    if (tile_width & 1) {
-      ++histo[TransformColorRed(green_to_red, *p_argb)];
-    }
-  }
-}
-
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
 static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
@ -902,10 +664,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
-  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
-  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  VP8LTransformColorInverse = TransformColorInverse;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@ -288,22 +288,6 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
 }
 #endif  // USE_VTBLQ

-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
-  const uint32_t* const end = argb_data + (num_pixels & ~3);
-#ifdef USE_VTBLQ
-  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
-#else
-  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
-#endif
-  for (; argb_data < end; argb_data += 4) {
-    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
-    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
-    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
-  }
-  // fallthrough and finish off with plain-C
-  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
-}
-
 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
 #ifdef USE_VTBLQ
@ -345,7 +329,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;

-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
 #endif
 }
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@ -155,22 +155,6 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
 //------------------------------------------------------------------------------
 // Subtract-Green Transform

-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  int i;
-  for (i = 0; i + 4 <= num_pixels; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_sub_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
-  }
-  // fallthrough and finish off with plain-C
-  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
-}
-
 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i mask = _mm_set1_epi32(0x0000ff00);
  int i;
@ -204,45 +188,6 @@ static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
  return _mm_srli_epi32(signed_mult, 5);
 }

-static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
-                                       uint32_t* argb_data,
-                                       int num_pixels) {
-  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
-  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
-  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
-
-  int i;
-
-  for (i = 0; i + 4 <= num_pixels; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
-    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
-    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
-    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
-    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
-    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
-    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
-    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
-    const __m128i b = in;
-
-    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
-    const __m128i r_new =
-        _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
-    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
-
-    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
-    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
-    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
-    const __m128i b_new =
-        _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);
-
-    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
-  }
-
-  // Fall-back to C-version for left-overs.
-  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
-}
-
 static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                              uint32_t* argb_data,
                                              int num_pixels) {
@ -416,88 +361,6 @@ static void ConvertBGRAToBGR(const uint32_t* src,
  VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
 }

-//------------------------------------------------------------------------------
-
-#define LINE_SIZE 16    // 8 or 16
-static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
-                      int size) {
-  int i;
-  assert(size % LINE_SIZE == 0);
-  for (i = 0; i < size; i += LINE_SIZE) {
-    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
-    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
-#if (LINE_SIZE == 16)
-    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
-    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
-#endif
-    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i +  0]);
-    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i +  4]);
-#if (LINE_SIZE == 16)
-    const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i +  8]);
-    const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
-#endif
-    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
-    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
-    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
-    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
-  }
-}
-
-static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
-  int i;
-  assert(size % LINE_SIZE == 0);
-  for (i = 0; i < size; i += LINE_SIZE) {
-    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
-    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
-#if (LINE_SIZE == 16)
-    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
-    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
-#endif
-    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);
-    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
-#if (LINE_SIZE == 16)
-    const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
-    const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
-#endif
-    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
-    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
-    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
-    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
-  }
-}
-#undef LINE_SIZE
-
-// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
-// that's ok since the histogram values are less than 1<<28 (max picture size).
-static void HistogramAdd(const VP8LHistogram* const a,
-                         const VP8LHistogram* const b,
-                         VP8LHistogram* const out) {
-  int i;
-  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
-  assert(a->palette_code_bits_ == b->palette_code_bits_);
-  if (b != out) {
-    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
-    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
-    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
-    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
-  } else {
-    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
-    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
-    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
-    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
-  }
-  for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
-    out->literal_[i] = a->literal_[i] + b->literal_[i];
-  }
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    out->distance_[i] = a->distance_[i] + b->distance_[i];
-  }
-}
-
 //------------------------------------------------------------------------------
 // Entry point

@ -514,18 +377,13 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;

-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-
-  VP8LTransformColor = TransformColor;
  VP8LTransformColorInverse = TransformColorInverse;

  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
-
-  VP8LHistogramAdd = HistogramAdd;
 }

 #else  // !WEBP_USE_SSE2
--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@ -1216,7 +1216,7 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
  enc->config_ = config;
  enc->pic_ = picture;

-  VP8LDspInit();
+  VP8LEncDspInit();

  return enc;
 }