Merge "DecodeImageData(): change the incorrect assert" into 0.5.1

2016-06-17 06:07:44 +00:00 · 2016-06-17 06:07:44 +00:00 · e7b917726f
commit e7b917726f
parent cfbcc5ece0 2abfa54f95
14 changed files with 842 additions and 105 deletions
--- a/Android.mk
+++ b/Android.mk
@ -49,6 +49,7 @@ dsp_dec_srcs := \
    src/dsp/dec_clip_tables.c \
    src/dsp/dec_mips32.c \
    src/dsp/dec_mips_dsp_r2.c \
    src/dsp/dec_msa.c \
    src/dsp/dec_neon.$(NEON) \
    src/dsp/dec_sse2.c \
    src/dsp/dec_sse41.c \
--- a/Makefile.vc
+++ b/Makefile.vc
@ -194,6 +194,7 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\dec_clip_tables.obj \
    $(DIROBJ)\dsp\dec_mips32.obj \
    $(DIROBJ)\dsp\dec_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\dec_msa.obj \
    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
    $(DIROBJ)\dsp\dec_sse41.obj \
--- a/build.gradle
+++ b/build.gradle
@ -38,12 +38,20 @@ model {
      architecture "x86_64"
    }
    mips32r2
    mips32r5
    mips64r6
  }
  toolChains {
    gcc(Gcc) {
      target("mips32r2") {
        cCompiler.args "-mips32r2"
      }
      target("mips32r5") {
        cCompiler.args "-mips32r5"
      }
      target("mips64r6") {
        cCompiler.args "-mips64r6"
      }
    }
  }
  binaries {
@ -111,6 +119,7 @@ model {
            include "dec_clip_tables.c"
            include "dec_mips32.c"
            include "dec_mips_dsp_r2.c"
            include "dec_msa.c"
            include "dec_neon.$NEON"
            include "dec_sse2.c"
            include "dec_sse41.c"
--- a/configure.ac
+++ b/configure.ac
@ -68,6 +68,7 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-fvisibility=hidden])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wfloat-conversion])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-nonliteral])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-security])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
--- a/makefile.unix
+++ b/makefile.unix
@ -138,6 +138,7 @@ DSP_DEC_OBJS = \
    src/dsp/dec_clip_tables.o \
    src/dsp/dec_mips32.o \
    src/dsp/dec_mips_dsp_r2.o \
    src/dsp/dec_msa.o \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
    src/dsp/dec_sse41.o \
@ -273,6 +274,7 @@ HDRS = \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
    src/dsp/mips_macro.h \
    src/dsp/msa_macro.h \
    src/dsp/neon.h \
    src/dsp/yuv.h \
    src/enc/backward_references.h \
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
@ -1058,7 +1058,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
  const int mask = hdr->huffman_mask_;
  const HTreeGroup* htree_group =
      (src < src_last) ? GetHtreeGroupForPos(hdr, col, row) : NULL;
-  assert(src < src_end);
+  assert(dec->last_row_ < last_row);
  assert(src_last <= src_end);
  while (src < src_last) {
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -2,6 +2,7 @@ noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
 noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
 noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
 noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
 noinst_LTLIBRARIES += libwebpdspdecode_msa.la
 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@ -80,6 +81,12 @@ libwebpdspdecode_neon_la_SOURCES += upsampling_neon.c
 libwebpdspdecode_neon_la_CPPFLAGS = $(libwebpdsp_neon_la_CPPFLAGS)
 libwebpdspdecode_neon_la_CFLAGS = $(libwebpdsp_neon_la_CFLAGS)
 libwebpdspdecode_msa_la_SOURCES =
 libwebpdspdecode_msa_la_SOURCES += dec_msa.c
 libwebpdspdecode_msa_la_SOURCES += msa_macro.h
 libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdspdecode_msa_la_CFLAGS = $(AM_CFLAGS)
 libwebpdsp_sse2_la_SOURCES =
 libwebpdsp_sse2_la_SOURCES += argb_sse2.c
 libwebpdsp_sse2_la_SOURCES += cost_sse2.c
@ -117,6 +124,7 @@ libwebpdsp_la_LIBADD =
 libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
 libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
 libwebpdsp_la_LIBADD += libwebpdsp_neon.la
 libwebpdsp_la_LIBADD += libwebpdspdecode_msa.la
 if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
@ -127,4 +135,5 @@ if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_neon.la
  libwebpdspdecode_la_LIBADD += libwebpdspdecode_msa.la
 endif
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -699,6 +699,7 @@ extern void VP8DspInitSSE41(void);
 extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 extern void VP8DspInitMSA(void);
 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
@ -783,6 +784,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8DspInitMIPSdspR2();
    }
 #endif
 #if defined(WEBP_USE_MSA)
    if (VP8GetCPUInfo(kMSA)) {
      VP8DspInitMSA();
    }
 #endif
  }
  dec_last_cpuinfo_used = VP8GetCPUInfo;
--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@ -0,0 +1,172 @@
 // Copyright 2016 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MSA version of dsp functions
 //
 // Author(s):  Prashant Patil   (prashant.patil@imgtec.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_MSA)
 #include "./msa_macro.h"
 //------------------------------------------------------------------------------
 // Transforms
 // One 1-D pass of the 4-point inverse transform on four v4i32 vectors.
 // 'in0'..'in3' are the inputs and 'out0'..'out3' receive the results of the
 // final BUTTERFLY_4. Both multipliers are applied as (x * k) >> 16, i.e. as
 // fractions of 65536; their names indicate sqrt(2)*cos(pi/8) - 1 (20091) and
 // sqrt(2)*sin(pi/8) (35468).
 // The expansion is a braced block that declares its own temporaries and
 // constants, so it must be used as a statement.
 #define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
  v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);           \
  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                 \
                                                                 \
  a1_m = in0 + in2;                                              \
  b1_m = in0 - in2;                                              \
  c_tmp1_m = (in1 * sinpi8sqrt2) >> 16;                          \
  c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16);            \
  c1_m = c_tmp1_m - c_tmp2_m;                                    \
  d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16);            \
  d_tmp2_m = (in3 * sinpi8sqrt2) >> 16;                          \
  d1_m = d_tmp1_m + d_tmp2_m;                                    \
  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
 }
 // Scalar counterparts of the multiplies in IDCT_1D_W, using the same two
 // constants: MULT1(a) is a + ((a * 20091) >> 16), MULT2(a) is
 // (a * 35468) >> 16.
 // MULT1 evaluates 'a' twice; pass expressions without side effects.
 #define MULT1(a) ((a) + (((a) * 20091) >> 16))
 #define MULT2(a) ((35468 * (a)) >> 16)
 // Inverse-transforms one 4x4 block and adds it to the pixels at 'dst'.
 //   in  : 16 int16_t coefficients, read as two 8-halfword vectors.
 //   dst : top-left of the 4x4 destination block; rows are BPS bytes apart.
 // The coefficients are widened to 32 bits, run through IDCT_1D_W twice with
 // a transpose in between, rounded-shifted right by 3, added to the existing
 // pixels, clipped to [0, 255] and stored back as four 4-byte rows.
 static void TransformOne(const int16_t* in, uint8_t* dst) {
  v8i16 input0, input1;
  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
  v4i32 res0, res1, res2, res3;
  const v16i8 zero = { 0 };
  v16i8 dest0, dest1, dest2, dest3;
  // Load in[0..7] and in[8..15], then sign-extend each half to word lanes.
  LD_SH2(in, 8, input0, input1);
  UNPCK_SH_SW(input0, in0, in1);
  UNPCK_SH_SW(input1, in2, in3);
  // First 1-D pass, transpose, second 1-D pass.
  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
  // Each row load fetches a full 16-byte vector; the two interleaves with
  // 'zero' keep only the right-most four bytes, widened to word lanes.
  LD_SB4(dst, BPS, dest0, dest1, dest2, dest3);
  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
             res0, res1, res2, res3);
  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
             res0, res1, res2, res3);
  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
  CLIP_SW4_0_255(res0, res1, res2, res3);
  // Two rounds of even-byte packing narrow the four word vectors into one
  // vector; its word elements 3, 2, 1, 0 are then stored as rows 0..3.
  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
 // Applies TransformOne to the block at 'in'/'dst' and, when 'do_two' is
 // non-zero, to the following block as well (16 coefficients further along
 // 'in', 4 bytes further along 'dst').
 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int blk;
  for (blk = 0; blk < num_blocks; ++blk) {
    TransformOne(in + 16 * blk, dst + 4 * blk);
  }
 }
 // Transforms the 16 int16_t values at 'in' and scatters the 16 results to
 // out[0], out[16], ..., out[240] (one result every 16 int16_t).
 // The computation is four add/subtract stages on 8-halfword vectors with
 // halfword shuffles (mask0..mask3) in between; every result then has 3 added
 // and is shifted right by 3 before being stored.
 // NOTE(review): presumably the inverse Walsh-Hadamard transform that yields
 // one value per 4x4 block -- confirm against the plain-C implementation.
 static void TransformWHT(const int16_t* in, int16_t* out) {
  v8i16 input0, input1;
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1;
  LD_SH2(in, 8, input0, input1);
  // Slide input1 by 8 bytes against itself, i.e. swap its two 4-halfword
  // halves, before the first add/subtract.
  input1 = SLDI_SH(input1, input1, 8);
  tmp0 = input0 + input1;
  tmp1 = input0 - input1;
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  out0 = tmp2 + tmp3;
  out1 = tmp2 - tmp3;
  VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
  tmp0 = input0 + input1;
  tmp1 = input0 - input1;
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  tmp0 = tmp2 + tmp3;
  tmp1 = tmp2 - tmp3;
  // Final scaling: (x + 3) >> 3 on every element.
  ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1);
  SRAI_H2_SH(out0, out1, 3);
  // Scatter the 16 halfword lanes to their strided output positions.
  out[0] = __msa_copy_s_h(out0, 0);
  out[16] = __msa_copy_s_h(out0, 4);
  out[32] = __msa_copy_s_h(out1, 0);
  out[48] = __msa_copy_s_h(out1, 4);
  out[64] = __msa_copy_s_h(out0, 1);
  out[80] = __msa_copy_s_h(out0, 5);
  out[96] = __msa_copy_s_h(out1, 1);
  out[112] = __msa_copy_s_h(out1, 5);
  out[128] = __msa_copy_s_h(out0, 2);
  out[144] = __msa_copy_s_h(out0, 6);
  out[160] = __msa_copy_s_h(out1, 2);
  out[176] = __msa_copy_s_h(out1, 6);
  out[192] = __msa_copy_s_h(out0, 3);
  out[208] = __msa_copy_s_h(out0, 7);
  out[224] = __msa_copy_s_h(out1, 3);
  out[240] = __msa_copy_s_h(out1, 7);
 }
 // Transform variant that reads only in[0]: the value (in[0] + 4) >> 3 is
 // replicated into a halfword vector and handed, once per row, to
 // ADDBLK_ST4x4_UB together with 'dst' and the row stride BPS.
 static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int rounded_dc = (in[0] + 4) >> 3;
  const v8i16 dc_row = __msa_fill_h(rounded_dc);
  ADDBLK_ST4x4_UB(dc_row, dc_row, dc_row, dc_row, dst, BPS);
 }
 // Transform variant that reads only in[0], in[1] and in[4].
 // Row y of the 4x4 result is a per-row constant (a + d4, a + c4, a - c4,
 // a - d4 for y = 0..3) plus the per-column offsets { in3, in2, -in2, -in3 },
 // shifted right by 3. The result is added to the existing pixels at 'dst'
 // (rows BPS bytes apart), clipped to [0, 255] and stored back.
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;  // rounding term for the final '>> 3'
  const int c4 = MULT2(in[4]);
  const int d4 = MULT1(in[4]);
  const int in2 = MULT2(in[1]);
  const int in3 = MULT1(in[1]);
  v4i32 tmp0 = { 0 };
  v4i32 out0 = __msa_fill_w(a + d4);
  v4i32 out1 = __msa_fill_w(a + c4);
  v4i32 out2 = __msa_fill_w(a - c4);
  v4i32 out3 = __msa_fill_w(a - d4);
  v4i32 res0, res1, res2, res3;
  const v4i32 zero = { 0 };
  v16u8 dest0, dest1, dest2, dest3;
  // tmp0 holds the column offsets shared by all four rows.
  INSERT_W4_SW(in3, in2, -in2, -in3, tmp0);
  ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0,
       out0, out1, out2, out3);
  SRAI_W4_SW(out0, out1, out2, out3, 3);
  // Load the four destination rows and widen their right-most four bytes to
  // word lanes by interleaving with 'zero' twice.
  LD_UB4(dst, BPS, dest0, dest1, dest2, dest3);
  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
             res0, res1, res2, res3);
  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
             res0, res1, res2, res3);
  ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3);
  CLIP_SW4_0_255(res0, res1, res2, res3);
  // Narrow back to bytes and store word elements 3, 2, 1, 0 as rows 0..3.
  PCKEV_B2_SW(res0, res1, res2, res3, out0, out1);
  res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1);
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
 //------------------------------------------------------------------------------
 // Entry point
 extern void VP8DspInitMSA(void);
 // Points the VP8Transform* function pointers at the MSA implementations
 // defined in this file.
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
  VP8Transform = TransformTwo;
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;
  VP8TransformWHT = TransformWHT;
 }
 #else  // !WEBP_USE_MSA
 WEBP_DSP_INIT_STUB(VP8DspInitMSA)
 #endif  // WEBP_USE_MSA
--- a/src/dsp/msa_macro.h
+++ b/src/dsp/msa_macro.h
@ -0,0 +1,555 @@
 // Copyright 2016 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MSA common macros
 //
 // Author(s):  Prashant Patil   (prashant.patil@imgtec.com)
 #ifndef WEBP_DSP_MSA_MACRO_H_
 #define WEBP_DSP_MSA_MACRO_H_
 #include <stdint.h>
 #include <msa.h>
 // CLANG_BUILD is defined when compiling with clang; the ADDVI_H / SRAI_H /
 // SRAI_W helpers switch on it to pick intrinsics over plain operators.
 #if defined(__clang__)
  #define CLANG_BUILD
 #endif
 // Element-wise "add" and "shift right" helpers.
 //   ADDVI_H(a, b) : halfword vector 'a' plus 'b'
 //   SRAI_H(a, b)  : halfword vector 'a' shifted right by 'b'
 //   SRAI_W(a, b)  : word vector 'a' shifted right by 'b'
 // With CLANG_BUILD the MSA intrinsics are called explicitly; otherwise the
 // plain '+' and '>>' operators are applied to the operands.
 // Every argument is parenthesized in the expansion so that operands holding
 // lower-precedence operators (e.g. 'x | y' or '1 << n') are used as a whole.
 #ifdef CLANG_BUILD
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)(a), (b))
  #define SRAI_H(a, b)  __msa_srai_h((v8i16)(a), (b))
  #define SRAI_W(a, b)  __msa_srai_w((v4i32)(a), (b))
 #else
  #define ADDVI_H(a, b)  ((a) + (b))
  #define SRAI_H(a, b)  ((a) >> (b))
  #define SRAI_W(a, b)  ((a) >> (b))
 #endif
 // Whole-vector load/store through a pointer cast.
 //   LD_x(RTYPE, psrc)     : reads one RTYPE vector from 'psrc'.
 //   ST_x(RTYPE, in, pdst) : writes vector 'in' to 'pdst'.
 // The B/H/W variants expand to the same text; they differ only in the vector
 // type supplied by the typed wrappers (LD_UB, LD_SH, ST_SW, ...).
 // The ST_x expansions are assignment expressions without a trailing
 // semicolon; the caller supplies it.
 #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))
 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
 #define LD_H(RTYPE, psrc) *((RTYPE*)(psrc))
 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
 #define LD_W(RTYPE, psrc) *((RTYPE*)(psrc))
 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
 #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
 #define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
 #define ST_W(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
 #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
 // Generates 'static inline TYPE FUNC_NAME(const void* psrc)', which loads a
 // TYPE from 'psrc' with the single assembly instruction INSTR. The source is
 // passed to the asm as a memory operand and the result comes back in a
 // general-purpose register.
 // MSA_LOAD(psrc, FUNC_NAME) simply calls the generated function.
 #define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME)             \
  static inline TYPE FUNC_NAME(const void* const psrc) {  \
    const uint8_t* const psrc_m = (const uint8_t*)psrc;   \
    TYPE val_m;                                           \
    asm volatile (                                        \
      "" #INSTR " %[val_m], %[psrc_m]  \n\t"              \
      : [val_m] "=r" (val_m)                              \
      : [psrc_m] "m" (*psrc_m));                          \
    return val_m;                                         \
  }
 #define MSA_LOAD(psrc, FUNC_NAME)  FUNC_NAME(psrc)
 // Generates 'static inline void FUNC_NAME(TYPE val, void* pdst)', which
 // stores 'val' to 'pdst' with the single assembly instruction INSTR. The
 // value is passed to the asm in a general-purpose register and the
 // destination as a memory operand.
 // MSA_STORE(val, pdst, FUNC_NAME) simply calls the generated function.
 #define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME)               \
  static inline void FUNC_NAME(TYPE val, void* const pdst) { \
    uint8_t* const pdst_m = (uint8_t*)pdst;                  \
    TYPE val_m = val;                                        \
    asm volatile (                                           \
      " " #INSTR "  %[val_m],  %[pdst_m]  \n\t"              \
      : [pdst_m] "=m" (*pdst_m)                              \
      : [val_m] "r" (val_m));                                \
  }
 #define MSA_STORE(val, pdst, FUNC_NAME)  FUNC_NAME(val, pdst)
 // Scalar (general-purpose register) load/store helpers:
 //   LH / LW / LD : load a 16 / 32 / 64-bit value from 'psrc'
 //   SH / SW / SD : store a 16 / 32 / 64-bit value 'val' to 'pdst'
 // For ISA revision 6 and later the lh/lw/ld and sh/sw/sd instructions are
 // used; earlier revisions use the ulh/ulw/uld and ush/usw mnemonics
 // (presumably the assembler's unaligned-access forms -- confirm).
 // Where no 64-bit access is generated, LD and SD are composed from two
 // 32-bit accesses: the word at the lower address is the low half of the
 // 64-bit value and the word 4 bytes further on is the high half.
 // In those composed forms the address is converted to a byte pointer before
 // the "+ 4", so the offset is always 4 bytes whatever pointer type the
 // caller passes, and SD evaluates 'val' exactly once.
 #if (__mips_isa_rev >= 6)
  MSA_LOAD_FUNC(uint16_t, lh, msa_lh);
  #define LH(psrc)  MSA_LOAD(psrc, msa_lh)
  MSA_LOAD_FUNC(uint32_t, lw, msa_lw);
  #define LW(psrc)  MSA_LOAD(psrc, msa_lw)
  #if (__mips == 64)
    MSA_LOAD_FUNC(uint64_t, ld, msa_ld);
    #define LD(psrc)  MSA_LOAD(psrc, msa_ld)
  #else  // !(__mips == 64)
    #define LD(psrc)                                                        \
      ((((uint64_t)MSA_LOAD((const uint8_t*)(psrc) + 4, msa_lw)) << 32) |  \
       MSA_LOAD((const uint8_t*)(psrc), msa_lw))
  #endif  // (__mips == 64)
  MSA_STORE_FUNC(uint16_t, sh, msa_sh);
  #define SH(val, pdst)  MSA_STORE(val, pdst, msa_sh)
  MSA_STORE_FUNC(uint32_t, sw, msa_sw);
  #define SW(val, pdst)  MSA_STORE(val, pdst, msa_sw)
  MSA_STORE_FUNC(uint64_t, sd, msa_sd);
  #define SD(val, pdst)  MSA_STORE(val, pdst, msa_sd)
 #else  // !(__mips_isa_rev >= 6)
  MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh);
  #define LH(psrc)  MSA_LOAD(psrc, msa_ulh)
  MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw);
  #define LW(psrc)  MSA_LOAD(psrc, msa_ulw)
  #if (__mips == 64)
    MSA_LOAD_FUNC(uint64_t, uld, msa_uld);
    #define LD(psrc)  MSA_LOAD(psrc, msa_uld)
  #else  // !(__mips == 64)
    #define LD(psrc)                                                         \
      ((((uint64_t)MSA_LOAD((const uint8_t*)(psrc) + 4, msa_ulw)) << 32) |  \
       MSA_LOAD((const uint8_t*)(psrc), msa_ulw))
  #endif  // (__mips == 64)
  MSA_STORE_FUNC(uint16_t, ush, msa_ush);
  #define SH(val, pdst)  MSA_STORE(val, pdst, msa_ush)
  MSA_STORE_FUNC(uint32_t, usw, msa_usw);
  #define SW(val, pdst)  MSA_STORE(val, pdst, msa_usw)
  #define SD(val, pdst) {                                                  \
    uint8_t* const pdst_sd_m = (uint8_t*)(pdst);                           \
    const uint64_t val_sd_m = (uint64_t)(val);                             \
    const uint32_t val0_m = (uint32_t)(val_sd_m & 0x00000000FFFFFFFF);     \
    const uint32_t val1_m =                                                \
        (uint32_t)((val_sd_m >> 32) & 0x00000000FFFFFFFF);                 \
    SW(val0_m, pdst_sd_m);                                                 \
    SW(val1_m, pdst_sd_m + 4);                                             \
  }
 #endif  // (__mips_isa_rev >= 6)
 /* Description : Load 4 words with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 * Details     : Load word in 'out0' from (psrc)
 *               Load word in 'out1' from (psrc + stride)
 *               Load word in 'out2' from (psrc + 2 * stride)
 *               Load word in 'out3' from (psrc + 3 * stride)
 *               'psrc' is converted to a byte pointer first, so 'stride' is
 *               a distance in bytes.
 */
 #define LW4(psrc, stride, out0, out1, out2, out3) {  \
  const uint8_t* ptmp = (const uint8_t*)psrc;        \
  out0 = LW(ptmp);                                   \
  ptmp += stride;                                    \
  out1 = LW(ptmp);                                   \
  ptmp += stride;                                    \
  out2 = LW(ptmp);                                   \
  ptmp += stride;                                    \
  out3 = LW(ptmp);                                   \
 }
 /* Description : Store 4 words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               Store word from 'in2' to (pdst + 2 * stride)
 *               Store word from 'in3' to (pdst + 3 * stride)
 *               'pdst' is converted to a byte pointer first, so 'stride' is
 *               a distance in bytes.
 */
 #define SW4(in0, in1, in2, in3, pdst, stride) {  \
  uint8_t* ptmp = (uint8_t*)pdst;                \
  SW(in0, ptmp);                                 \
  ptmp += stride;                                \
  SW(in1, ptmp);                                 \
  ptmp += stride;                                \
  SW(in2, ptmp);                                 \
  ptmp += stride;                                \
  SW(in3, ptmp);                                 \
 }
 /* Description : Load vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Load 16 byte elements in 'out0' from (psrc)
 *               Load 16 byte elements in 'out1' from (psrc + stride)
 *               'stride' is added to 'psrc' before the cast to RTYPE*, so it
 *               is scaled by the element type that 'psrc' points to.
 *               LD_B4 loads four vectors, from (psrc) up to
 *               (psrc + 3 * stride), into 'out0'..'out3'.
 */
 #define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
  out0 = LD_B(RTYPE, psrc);                       \
  out1 = LD_B(RTYPE, psrc + stride);              \
 }
 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
  LD_B2(RTYPE, psrc, stride, out0, out1);                     \
  LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3);       \
 }
 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 /* Description : Load vectors with 8 halfword elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Load 8 halfword elements in 'out0' from (psrc)
 *               Load 8 halfword elements in 'out1' from (psrc + stride)
 *               'stride' is added to 'psrc' before the cast to RTYPE*, so it
 *               is scaled by the element type that 'psrc' points to.
 */
 #define LD_H2(RTYPE, psrc, stride, out0, out1) {  \
  out0 = LD_H(RTYPE, psrc);                       \
  out1 = LD_H(RTYPE, psrc + stride);              \
 }
 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
 /* Description : Store 4x4 byte block to destination memory from input vectors
 * Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
 * Details     : 'idx0' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst)
 *               'idx1' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst + stride)
 *               'idx2' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 2 * stride)
 *               'idx3' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 3 * stride)
 *               'stride' is in bytes ('pdst' is converted to a byte pointer).
 */
 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
  uint8_t* const pblk_4x4_m = (uint8_t*)pdst;                       \
  const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0);         \
  const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1);         \
  const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2);         \
  const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3);         \
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);          \
 }
 /* Description : Immediate number of elements to slide
 * Arguments   : Inputs  - in0, in1, slide_val
 *               Outputs - out
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in1' vector are slid into 'in0' by
 *               value specified in the 'slide_val'
 *               The expansion is a single expression: the result of
 *               __msa_sldi_b cast to RTYPE.
 */
 /* The definition must end without a line-continuation backslash; otherwise
 * the '#define' that follows is spliced into this macro's body.
 */
 #define SLDI_B(RTYPE, in0, in1, slide_val)                      \
        (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val)
 #define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)
 #define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
 #define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)
 /* Description : Shuffle halfword vector elements as per mask vector
 * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : halfword elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'
 *               halfword elements from 'in2' & 'in3' are copied selectively to
 *               'out1' as per control vector 'mask1'
 *               The intrinsic receives its operands as (mask, in1, in0) and
 *               (mask, in3, in2) respectively.
 */
 #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
  out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);     \
  out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);     \
 }
 #define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
 /* Description : Clips all signed halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output  - val
 *               Return Type - signed halfword
 * Details     : 'val' is updated in place: first the signed maximum with 0,
 *               then the signed minimum with a vector of 255s.
 *               CLIP_SH2_0_255 applies the same operation to two vectors.
 */
 #define CLIP_SH_0_255(val) {                      \
  const v8i16 max_m = __msa_ldi_h(255);           \
  val = __msa_maxi_s_h((v8i16)val, 0);            \
  val = __msa_min_s_h(max_m, (v8i16)val);         \
 }
 #define CLIP_SH2_0_255(in0, in1) {  \
  CLIP_SH_0_255(in0);               \
  CLIP_SH_0_255(in1);               \
 }
 /* Description : Clips all signed word elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output  - val
 *               Return Type - signed word
 * Details     : 'val' is updated in place: first the signed maximum with 0,
 *               then the signed minimum with a vector of 255s.
 *               CLIP_SW4_0_255 applies the same operation to four vectors.
 */
 #define CLIP_SW_0_255(val) {                      \
  const v4i32 max_m = __msa_ldi_w(255);           \
  val = __msa_maxi_s_w((v4i32)val, 0);            \
  val = __msa_min_s_w(max_m, (v4i32)val);         \
 }
 #define CLIP_SW4_0_255(in0, in1, in2, in3) {  \
  CLIP_SW_0_255(in0);                         \
  CLIP_SW_0_255(in1);                         \
  CLIP_SW_0_255(in2);                         \
  CLIP_SW_0_255(in3);                         \
 }
 /* Description : Set word element n of a vector to a GPR value
 * Arguments   : Inputs - in0, in1, in2, in3
 *               Input/output - out
 *               Return Type - as per RTYPE
 * Details     : Set word element 0 in vector 'out' to value specified in 'in0'
 *               Set word element 1 in vector 'out' to value specified in 'in1'
 *               INSERT_W4 additionally sets elements 2 and 3 from 'in2' and
 *               'in3'. Elements that are not written keep their previous
 *               value, so 'out' must already hold a valid vector.
 */
 #define INSERT_W2(RTYPE, in0, in1, out) {           \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
 }
 #define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {  \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);   \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);   \
  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);   \
  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);   \
 }
 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
 #define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
 /* Description : Interleave right half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to out0.
 *               Right half of byte elements of 'in2' and 'in3' are interleaved
 *               and written to out1.
 *               ILVR_B4 does the same for four input pairs (in0..in7) and
 *               four outputs (out0..out3).
 */
 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);     \
 }
 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
 }
 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
 /* Description : Interleave right half of halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of halfword elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 *               Right half of halfword elements of 'in2' and 'in3' are
 *               interleaved and written to 'out1'.
 *               ILVR_H4 does the same for four input pairs (in0..in7) and
 *               four outputs (out0..out3).
 */
 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \
  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);     \
 }
 #define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
 }
 #define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
 /* Description : Interleave right half of double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of double word elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 *               Right half of double word elements of 'in2' and 'in3' are
 *               interleaved and written to 'out1'.
 */
 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1);     \
  out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3);     \
 }
 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
 /* Description : Interleave both right and left halves of a vector pair
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'out0' receives the right-half interleave (ilvr) of 'in0'
 *               and 'in1'; 'out1' receives the left-half interleave (ilvl)
 *               of the same pair. ILVRL_H2 works on halfword elements,
 *               ILVRL_W2 on word elements.
 */
 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
 }
 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
 #define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
 }
 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
 /* Description : Pack even byte elements of vector pairs
 *  Arguments   : Inputs  - in0, in1, in2, in3
 *                Outputs - out0, out1
 *                Return Type - as per RTYPE
 *  Details     : Even byte elements of 'in0' are copied to the left half of
 *                'out0' & even byte elements of 'in1' are copied to the right
 *                half of 'out0'.
 *                'out1' is produced the same way from 'in2' and 'in3'.
 */
 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);     \
  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);     \
 }
 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
 /* Description : Arithmetic immediate shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vectors 'in0' and 'in1' is right shifted by
 *               'shift_val' and the result is written in-place.
 *               'shift_val' is forwarded to SRAI_W, which is the
 *               __msa_srai_w intrinsic when CLANG_BUILD is defined and the
 *               '>>' operator otherwise.
 *               SRAI_W4 does the same for four vectors.
 */
 #define SRAI_W2(RTYPE, in0, in1, shift_val) {  \
  in0 = (RTYPE)SRAI_W(in0, shift_val);         \
  in1 = (RTYPE)SRAI_W(in1, shift_val);         \
 }
 #define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
 #define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)
 #define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) {  \
  SRAI_W2(RTYPE, in0, in1, shift_val);                   \
  SRAI_W2(RTYPE, in2, in3, shift_val);                   \
 }
 #define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
 #define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)
 /* Description : Arithmetic shift right all elements of half-word vector
 * Arguments   : Inputs  - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vectors 'in0' and 'in1' is right shifted by
 *               'shift_val' and the result is written in-place.
 *               'shift_val' is forwarded to SRAI_H, which is the
 *               __msa_srai_h intrinsic when CLANG_BUILD is defined and the
 *               '>>' operator otherwise.
 */
 #define SRAI_H2(RTYPE, in0, in1, shift_val) {  \
  in0 = (RTYPE)SRAI_H(in0, shift_val);         \
  in1 = (RTYPE)SRAI_H(in1, shift_val);         \
 }
 #define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
 #define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)
 /* Description : Arithmetic rounded shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vectors 'in0' and 'in1' is right shifted
 *               (with rounding) by 'shift' and the result is written
 *               in-place. 'shift' is forwarded unchanged to the
 *               __msa_srari_w intrinsic.
 *               SRARI_W4 does the same for four vectors.
 */
 #define SRARI_W2(RTYPE, in0, in1, shift) {        \
  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
 }
 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRARI_W2(RTYPE, in0, in1, shift);                   \
  SRARI_W2(RTYPE, in2, in3, shift);                   \
 }
 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
 #define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
 /* Description : Addition of a value to each of 2 half-word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'in1' is added to each element of 'in0' and the result is
 *               written to 'out0'; 'in3' is added to each element of 'in2'
 *               and the result is written to 'out1'.
 *               'in1' and 'in3' are forwarded as the second operand of
 *               ADDVI_H, which is the __msa_addvi_h intrinsic when
 *               CLANG_BUILD is defined and the '+' operator otherwise.
 */
 #define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)ADDVI_H(in0, in1);                         \
  out1 = (RTYPE)ADDVI_H(in2, in3);                         \
 }
 #define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
 #define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
 /* Description : Addition of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in0' is added to 'in1' and result is written
 *               to 'out0'; likewise 'in2' + 'in3' is written to 'out1'.
 *               ADD4 does the same for four pairs (in0..in7 -> out0..out3).
 *               Input operands are parenthesized in the expansion, so an
 *               argument containing a lower-precedence operator (e.g.
 *               'x << 1' or 'x & y') is summed as a whole.
 */
 #define ADD2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) + (in1);                         \
  out1 = (in2) + (in3);                         \
 }
 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  ADD2(in0, in1, in2, in3, out0, out1);               \
  ADD2(in4, in5, in6, in7, out2, out3);               \
 }
 /* Description : Sign extend halfword elements from input vector and return
 *               the result in pair of vectors
 * Arguments   : Input   - in            (halfword vector)
 *               Outputs - out0, out1   (sign extended word vectors)
 *               Return Type - signed word
 * Details     : A per-element sign mask of 'in' is computed with
 *               __msa_clti_s_h(in, 0) (element < 0). The mask is interleaved
 *               right with 'in' to generate 4 signed word elements in 'out0',
 *               then interleaved left with 'in' to generate 4 signed word
 *               elements in 'out1'.
 *               'in' appears twice in the expansion, so it should be a plain
 *               vector variable rather than an expression with side effects.
 */
 #define UNPCK_SH_SW(in, out0, out1) {                 \
  const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0);   \
  ILVRL_H2_SW(tmp_m, in, out0, out1);                 \
 }
 /* Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     : Butterfly operation:
 *                 out0 = in0 + in3, out1 = in1 + in2,
 *                 out2 = in1 - in2, out3 = in0 - in3.
 *               Every input is parenthesized in the expansion so that an
 *               expression argument (e.g. 'a + b' passed as 'in2') is
 *               subtracted as a whole.
 *               Outputs are assigned in order while inputs are still being
 *               read, so 'out0'..'out2' must not name the same object as an
 *               input that is used on a later line.
 */
 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  out0 = (in0) + (in3);                                            \
  out1 = (in1) + (in2);                                            \
  out2 = (in1) - (in2);                                            \
  out3 = (in0) - (in3);                                            \
 }
 /* Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Return Type - as per RTYPE
 * Details     : (in0, in1) and (in2, in3) are first combined with ILVRL_W2_SW
 *               into four v4i32 temporaries. The outputs are then built from
 *               those temporaries with __msa_ilvr_d / __msa_ilvl_d and cast
 *               to RTYPE.
 *               The inputs only appear in the two ILVRL_W2_SW steps, which
 *               come before the first output is assigned.
 */
 #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v4i32 s0_m, s1_m, s2_m, s3_m;                                              \
  ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                         \
  ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                         \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                      \
  out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                      \
  out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                      \
  out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                      \
 }
 /* Variant producing v4i32 outputs. */
 #define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
 /* Description : Add block 4x4
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Least significant 4 bytes from each input vector are added to
 *               the destination bytes, clipped between 0-255 and stored.
 *               'pdst' is both read (LW4) and written (ST4x4_UB), using the
 *               same 'stride' for both.
 *               Steps, as far as the helper names indicate (helpers are
 *               defined elsewhere - confirm): the inputs are paired up with
 *               ILVR_D2_SH, four 32-bit words are loaded from 'pdst' and
 *               inserted into two byte vectors, those bytes are widened to
 *               halfwords by interleaving with zero, the sums are clipped
 *               with CLIP_SH2_0_255, packed back to bytes and stored.
 *               NOTE(review): the addition is done on v8i16 values, which
 *               suggests the low four halfword elements (rather than bytes)
 *               of each input are used - confirm.
 */
 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
  v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
  v16i8 dst0_m = { 0 };                                         \
  v16i8 dst1_m = { 0 };                                         \
  const v16i8 zero_m = { 0 };                                   \
  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
  CLIP_SH2_0_255(res0_m, res1_m);                               \
  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
 }
 #endif  /* WEBP_DSP_MSA_MACRO_H_ */
--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@ -126,54 +126,8 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
                                   int low_effort,
                                   uint32_t palette[MAX_PALETTE_SIZE],
                                   int* const palette_size) {
-  int i, x, y, key;
+  const int num_colors = WebPGetColorPalette(pic, palette);
-  int num_colors = 0;
+  if (num_colors > MAX_PALETTE_SIZE) return 0;
  uint8_t in_use[MAX_PALETTE_SIZE * 4] = { 0 };
  uint32_t colors[MAX_PALETTE_SIZE * 4];
  static const uint32_t kHashMul = 0x1e35a7bd;
  const uint32_t* argb = pic->argb;
  const int width = pic->width;
  const int height = pic->height;
  uint32_t last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      if (argb[x] == last_pix) {
        continue;
      }
      last_pix = argb[x];
      key = (kHashMul * last_pix) >> PALETTE_KEY_RIGHT_SHIFT;
      while (1) {
        if (!in_use[key]) {
          colors[key] = last_pix;
          in_use[key] = 1;
          ++num_colors;
          if (num_colors > MAX_PALETTE_SIZE) {
            return 0;
          }
          break;
        } else if (colors[key] == last_pix) {
          // The color is already there.
          break;
        } else {
          // Some other color sits there.
          // Do linear conflict resolution.
          ++key;
          key &= (MAX_PALETTE_SIZE * 4 - 1);  // key mask for 1K buffer.
        }
      }
    }
    argb += pic->argb_stride;
  }
  // TODO(skal): could we reuse in_use[] to speed up EncodePalette()?
  num_colors = 0;
  for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) {
    if (in_use[i]) {
      palette[num_colors] = colors[i];
      ++num_colors;
    }
  }
  *palette_size = num_colors;
  qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
  if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) {
--- a/src/mux/anim_encode.c
+++ b/src/mux/anim_encode.c
@ -646,61 +646,6 @@ static int IsLossyBlendingPossible(const WebPPicture* const src,
  return 1;
 }
 #define MIN_COLORS_LOSSY     31  // Don't try lossy below this threshold.
 #define MAX_COLORS_LOSSLESS 194  // Don't try lossless above this threshold.
 #define MAX_COLOR_COUNT     256  // Power of 2 greater than MAX_COLORS_LOSSLESS.
 #define HASH_SIZE (MAX_COLOR_COUNT * 4)
 #define HASH_RIGHT_SHIFT     22  // 32 - log2(HASH_SIZE).
 // TODO(urvang): Also used in enc/vp8l.c. Move to utils.
 // Counts the distinct ARGB values of 'pic' with a small linear-probing hash
 // table. The count saturates: as soon as MAX_COLOR_COUNT distinct colors have
 // been seen, MAX_COLOR_COUNT is returned; otherwise the exact count is.
 static int GetColorCount(const WebPPicture* const pic) {
  static const uint32_t kHashMul = 0x1e35a7bd;
  uint32_t table[HASH_SIZE];
  uint8_t occupied[HASH_SIZE] = { 0 };
  const uint32_t* row = pic->argb;
  const int num_rows = pic->height;
  const int num_cols = pic->width;
  uint32_t prev = ~row[0];   // guaranteed to differ from the first pixel
  int count = 0;
  int j, i;
  for (j = 0; j < num_rows; ++j, row += pic->argb_stride) {
    for (i = 0; i < num_cols; ++i) {
      const uint32_t pix = row[i];
      int slot;
      if (pix == prev) continue;  // cheap skip for runs of identical pixels
      prev = pix;
      slot = (kHashMul * pix) >> HASH_RIGHT_SHIFT;
      // Walk forward (with wrap-around) past slots held by other colors.
      while (occupied[slot] && table[slot] != pix) {
        slot = (slot + 1) & (HASH_SIZE - 1);
      }
      if (!occupied[slot]) {  // first time this color is seen
        table[slot] = pix;
        occupied[slot] = 1;
        if (++count >= MAX_COLOR_COUNT) {
          return MAX_COLOR_COUNT;  // Exact count not needed.
        }
      }
    }
  }
  return count;
 }
 #undef MAX_COLOR_COUNT
 #undef HASH_SIZE
 #undef HASH_RIGHT_SHIFT
 // For pixels in 'rect', replace those pixels in 'dst' that are same as 'src' by
 // transparent pixels.
 // Returns true if at least one pixel gets modified.
@ -864,6 +809,9 @@ enum {
  CANDIDATE_COUNT
 };
 #define MIN_COLORS_LOSSY     31  // Don't try lossy below this threshold.
 #define MAX_COLORS_LOSSLESS 194  // Don't try lossless above this threshold.
 // Generates candidates for a given dispose method given pre-filled sub-frame
 // 'params'.
 static WebPEncodingError GenerateCandidates(
@ -898,7 +846,7 @@ static WebPEncodingError GenerateCandidates(
    candidate_ll->evaluate_ = is_lossless;
    candidate_lossy->evaluate_ = !is_lossless;
  } else {  // Use a heuristic for trying lossless and/or lossy compression.
-    const int num_colors = GetColorCount(&params->sub_frame_ll_);
+    const int num_colors = WebPGetColorPalette(&params->sub_frame_ll_, NULL);
    candidate_ll->evaluate_ = (num_colors < MAX_COLORS_LOSSLESS);
    candidate_lossy->evaluate_ = (num_colors >= MIN_COLORS_LOSSY);
  }
--- a/src/utils/utils.c
+++ b/src/utils/utils.c
@ -15,6 +15,7 @@
 #include <string.h>  // for memcpy()
 #include "../webp/decode.h"
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"  // for MAX_PALETTE_SIZE
 #include "./utils.h"
 // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
@ -237,3 +238,68 @@ void WebPCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
 }
 //------------------------------------------------------------------------------
 #define MAX_COLOR_COUNT         MAX_PALETTE_SIZE
 #define COLOR_HASH_SIZE         (MAX_COLOR_COUNT * 4)
 #define COLOR_HASH_RIGHT_SHIFT  22  // 32 - log2(COLOR_HASH_SIZE).
 // Counts the unique ARGB values of 'pic' using a linear-probing hash table of
 // COLOR_HASH_SIZE slots and, if 'palette' is not NULL, copies them to it.
 // Returns the number of unique colors, or MAX_COLOR_COUNT + 1 as soon as more
 // than MAX_COLOR_COUNT colors have been seen (in that case 'palette' is not
 // written). Colors are stored in 'palette' in hash-slot order, i.e. unsorted.
 int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
  int i;
  int x, y;
  int num_colors = 0;
  uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
  uint32_t colors[COLOR_HASH_SIZE];
  static const uint32_t kHashMul = 0x1e35a7bdU;
  const uint32_t* argb;
  int width, height;
  uint32_t last_pix;
  // Check 'pic' before anything is read through it; the fields are therefore
  // loaded below rather than in the declarations above.
  assert(pic != NULL);
  assert(pic->use_argb);
  argb = pic->argb;
  width = pic->width;
  height = pic->height;
  last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int key;
      if (argb[x] == last_pix) {
        continue;  // Same as the previous pixel: already accounted for.
      }
      last_pix = argb[x];
      key = (kHashMul * last_pix) >> COLOR_HASH_RIGHT_SHIFT;
      while (1) {
        if (!in_use[key]) {
          colors[key] = last_pix;
          in_use[key] = 1;
          ++num_colors;
          if (num_colors > MAX_COLOR_COUNT) {
            return MAX_COLOR_COUNT + 1;  // Exact count not needed.
          }
          break;
        } else if (colors[key] == last_pix) {
          break;  // The color is already there.
        } else {
          // Some other color sits here, so do linear conflict resolution.
          ++key;
          key &= (COLOR_HASH_SIZE - 1);  // Key mask.
        }
      }
    }
    argb += pic->argb_stride;
  }
  if (palette != NULL) {  // Fill the colors into palette.
    num_colors = 0;
    for (i = 0; i < COLOR_HASH_SIZE; ++i) {
      if (in_use[i]) {
        palette[num_colors] = colors[i];
        ++num_colors;
      }
    }
  }
  return num_colors;
 }
 #undef MAX_COLOR_COUNT
 #undef COLOR_HASH_SIZE
 #undef COLOR_HASH_RIGHT_SHIFT
 //------------------------------------------------------------------------------
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@ -160,6 +160,19 @@ WEBP_EXTERN(void) WebPCopyPlane(const uint8_t* src, int src_stride,
 WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
                                 struct WebPPicture* const dst);
 //------------------------------------------------------------------------------
 // Unique colors.
 // Returns count of unique colors in 'pic', assuming pic->use_argb is true.
 // If the unique color count is more than MAX_PALETTE_SIZE, returns
 // MAX_PALETTE_SIZE+1.
 // If 'palette' is not NULL and number of unique colors is less than or equal to
 // MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'
 // (in no particular order).
 // Note: 'palette' is assumed to be an array already allocated with at least
 // MAX_PALETTE_SIZE elements.
 WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic,
                                     uint32_t* const palette);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus