README.wasm: add instructions for enabling mulhi

Change-Id: I1e9dd737f06ad76f73824352291a6e129ca5ded1
WebPMemToUint32: remove ptr cast to int
2017-11-02 11:20:09 -07:00 · 2017-10-31 18:24:54 -07:00 · 2017-10-30 20:40:48 -07:00 · 2017-10-28 11:49:18 -07:00 · 2017-10-28 11:49:18 -07:00 · 2017-10-28 11:49:18 -07:00
93 changed files with 7255 additions and 2294 deletions
--- a/Android.mk
+++ b/Android.mk
@@ -11,12 +11,24 @@ ifeq ($(APP_OPTIM),release)
  endif
 endif

+# mips32 fails to build with clang from r14b
+# https://bugs.chromium.org/p/webp/issues/detail?id=343
+ifeq ($(findstring clang,$(NDK_TOOLCHAIN_VERSION)),clang)
+  ifeq ($(TARGET_ARCH),mips)
+    clang_version := $(shell $(TARGET_CC) --version)
+    ifneq ($(findstring clang version 3,$(clang_version)),)
+      WEBP_CFLAGS += -no-integrated-as
+    endif
+  endif
+endif
+
 ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  NEON := c.neon
  USE_CPUFEATURES := yes
+  WEBP_CFLAGS += -DHAVE_CPU_FEATURES_H
 else
  NEON := c
 endif
@@ -79,6 +91,7 @@ dsp_dec_srcs := \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
    src/dsp/yuv_mips_dsp_r2.c \
+    src/dsp/yuv_neon.$(NEON) \
    src/dsp/yuv_sse2.c \

 dsp_enc_srcs := \
@@ -101,10 +114,13 @@ dsp_enc_srcs := \
    src/dsp/lossless_enc_neon.$(NEON) \
    src/dsp/lossless_enc_sse2.c \
    src/dsp/lossless_enc_sse41.c \
+    src/dsp/ssim.c \
+    src/dsp/ssim_sse2.c \

 enc_srcs := \
    src/enc/alpha_enc.c \
    src/enc/analysis_enc.c \
+    src/enc/backward_references_cost_enc.c \
    src/enc/backward_references_enc.c \
    src/enc/config_enc.c \
    src/enc/cost_enc.c \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,13 +3,21 @@ cmake_minimum_required(VERSION 2.8.7)
 project(libwebp C)

 # Options for coder / decoder executables.
+option(WEBP_ENABLE_SIMD "Enable any SIMD optimization." ON)
+option(WEBP_ENABLE_WASM "Enable WebAssembly optimizations." OFF)
 option(WEBP_BUILD_CWEBP "Build the cwebp command line tool." OFF)
 option(WEBP_BUILD_DWEBP "Build the dwebp command line tool." OFF)
 option(WEBP_BUILD_GIF2WEBP "Build the gif2webp conversion tool." OFF)
 option(WEBP_BUILD_IMG2WEBP "Build the img2webp animation tool." OFF)
+option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." OFF)
+option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
 option(WEBP_EXPERIMENTAL_FEATURES "Build with experimental features." OFF)
 option(WEBP_ENABLE_SWAP_16BIT_CSP "Enable byte swap for 16 bit colorspaces." OFF)

+if(WEBP_BUILD_WEBP_JS OR WEBP_ENABLE_WASM)
+  set(WEBP_ENABLE_SIMD OFF)
+endif()
+
 set(WEBP_DEP_LIBRARIES)
 set(WEBP_DEP_INCLUDE_DIRS)

@@ -21,11 +29,18 @@ endif()

 include(cmake/config.h.cmake)

+# Extract the version of the library.
+file(READ ${CMAKE_CURRENT_SOURCE_DIR}/configure.ac SOURCE_FILE)
+string(REGEX MATCH "[0-9.]+" WEBP_VERSION ${SOURCE_FILE})
+
 ################################################################################
 # Options.
 if(WEBP_ENABLE_SWAP_16BIT_CSP)
  add_definitions(-DWEBP_SWAP_16BIT_CSP)
 endif()
+if(WEBP_ENABLE_WASM)
+  add_definitions(-DWEBP_USE_WASM)
+endif()

 ################################################################################
 # Android only.
@@ -39,48 +54,110 @@ if(ANDROID)
  set(WEBP_DEP_INCLUDE_DIRS ${WEBP_DEP_INCLUDE_DIRS}
    ${ANDROID_NDK}/sources/android/cpufeatures
  )
+  add_definitions(-DHAVE_CPU_FEATURES_H)
 endif()

 ################################################################################
 # WebP source files.
 # Read the Makefile.am to get the source files.

-function(parse_Makefile_am FOLDER VAR)
+# Each Makefile.am is expected to list its sources on lines matching
+# "<SRC_REGEX>_SOURCES = ..." or "<SRC_REGEX>_SOURCES += ...". E.g.:
+# libimagedec_la_SOURCES  = image_dec.c image_dec.h
+function(parse_Makefile_am FOLDER VAR SRC_REGEX)
  file(READ ${FOLDER}/Makefile.am MAKEFILE_AM)
-  string(REGEX MATCHALL "_SOURCES \\+= [^\n]*"
+  string(REGEX MATCHALL "${SRC_REGEX}_SOURCES[ ]*\\+?=[ ]+[0-9a-z\\._ ]*"
    FILES_PER_LINE ${MAKEFILE_AM}
  )
  set(SRCS ${${VAR}})
  foreach(FILES ${FILES_PER_LINE})
-    string(SUBSTRING ${FILES} 12 -1 FILES)
-    string(REGEX MATCHALL "[0-9a-z\\._]+"
-      FILES ${FILES}
-    )
-    foreach(FILE ${FILES})
-      list(APPEND SRCS ${FOLDER}/${FILE})
-    endforeach()
+    string(FIND ${FILES} "=" OFFSET)
+    math(EXPR OFFSET "${OFFSET} + 2")
+    string(SUBSTRING ${FILES} ${OFFSET} -1 FILES)
+    if(FILES)
+      string(REGEX MATCHALL "[0-9a-z\\._]+"
+        FILES ${FILES}
+      )
+      foreach(FILE ${FILES})
+        list(APPEND SRCS ${FOLDER}/${FILE})
+      endforeach()
+    endif()
  endforeach()
  set(${VAR} ${SRCS} PARENT_SCOPE)
 endfunction()

-set(WEBP_SRCS)
-parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/dec "WEBP_SRCS")
-parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/demux "WEBP_SRCS")
-parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/dsp "WEBP_SRCS")
-parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/enc "WEBP_SRCS")
-parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/utils "WEBP_SRCS")
+set(WEBP_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
+parse_Makefile_am(${WEBP_SRC_DIR}/dec "WEBP_DEC_SRCS" "")
+parse_Makefile_am(${WEBP_SRC_DIR}/demux "WEBP_DEMUX_SRCS" "")
+parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_COMMON_SRCS" "COMMON")
+parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_ENC_SRCS" "ENC")
+parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_ENC_SRCS" "dsp_[^ ]*")
+parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_DEC_SRCS" "decode_[^ ]*")
+parse_Makefile_am(${WEBP_SRC_DIR}/enc "WEBP_ENC_SRCS" "")
+parse_Makefile_am(${WEBP_SRC_DIR}/utils "WEBP_UTILS_COMMON_SRCS" "COMMON")
+parse_Makefile_am(${WEBP_SRC_DIR}/utils "WEBP_UTILS_ENC_SRCS" "ENC")
+parse_Makefile_am(${WEBP_SRC_DIR}/utils "WEBP_UTILS_DEC_SRCS" "decode_[^ ]*")

 # Remove the files specific to SIMD we don't use.
 foreach(FILE ${WEBP_SIMD_FILES_NOT_TO_INCLUDE})
-  list(REMOVE_ITEM WEBP_SRCS ${FILE})
+  list(REMOVE_ITEM WEBP_DSP_ENC_SRCS ${FILE})
+  list(REMOVE_ITEM WEBP_DSP_DEC_SRCS ${FILE})
 endforeach()

-# Build the library.
+### Define the mandatory libraries.
+# Build the webpdecoder library.
 add_definitions(-Wall)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ ${WEBP_DEP_INCLUDE_DIRS})
-add_library(webp ${WEBP_SRCS})
+add_library(webpdecode OBJECT ${WEBP_DEC_SRCS})
+add_library(webpdspdecode OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS})
+add_library(webputilsdecode OBJECT ${WEBP_UTILS_COMMON_SRCS}
+  ${WEBP_UTILS_DEC_SRCS})
+add_library(webpdecoder $<TARGET_OBJECTS:webpdecode>
+  $<TARGET_OBJECTS:webpdspdecode> $<TARGET_OBJECTS:webputilsdecode>)
+target_link_libraries(webpdecoder ${WEBP_DEP_LIBRARIES})
+
+# Build the webp library.
+add_library(webpencode OBJECT ${WEBP_ENC_SRCS})
+add_library(webpdsp OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS}
+  ${WEBP_DSP_ENC_SRCS})
+add_library(webputils OBJECT ${WEBP_UTILS_COMMON_SRCS} ${WEBP_UTILS_DEC_SRCS}
+  ${WEBP_UTILS_ENC_SRCS})
+add_library(webp $<TARGET_OBJECTS:webpdecode> $<TARGET_OBJECTS:webpdsp>
+  $<TARGET_OBJECTS:webpencode> $<TARGET_OBJECTS:webputils>)
 target_link_libraries(webp ${WEBP_DEP_LIBRARIES})

+# Make sure the OBJECT libraries are built with position independent code
+# (it is not ON by default).
+set_target_properties(webpdecode webpdspdecode webputilsdecode
+  webpencode webpdsp webputils PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# Build the webp demux library.
+add_library(webpdemux ${WEBP_DEMUX_SRCS})
+target_link_libraries(webpdemux webp)
+
+# Set the version numbers.
+function(parse_version FILE NAME VAR)
+  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/src/${FILE} SOURCE_FILE)
+  string(REGEX MATCH "${NAME}_la_LDFLAGS[^\n]* -version-info [0-9:]+" TMP
+    ${SOURCE_FILE})
+  string(REGEX MATCH "[0-9:]+" TMP ${TMP})
+  string(REGEX REPLACE ":" "." VERSION ${TMP})
+  set(${VAR} "${VERSION}" PARENT_SCOPE)
+endfunction()
+parse_version(Makefile.am webp WEBP_WEBP_SOVERSION)
+set_target_properties(webp PROPERTIES VERSION ${WEBP_VERSION}
+  SOVERSION ${WEBP_WEBP_SOVERSION})
+parse_version(Makefile.am webpdecoder WEBP_DECODER_SOVERSION)
+set_target_properties(webpdecoder PROPERTIES VERSION ${WEBP_VERSION}
+  SOVERSION ${WEBP_DECODER_SOVERSION})
+parse_version(demux/Makefile.am webpdemux WEBP_DEMUX_SOVERSION)
+set_target_properties(webpdemux PROPERTIES VERSION ${WEBP_VERSION}
+  SOVERSION ${WEBP_DEMUX_SOVERSION})
+
+# Define the libraries to install.
+set(INSTALLED_LIBRARIES webpdecoder webp webpdemux)
+
+### Deal with SIMD.
 # Change the compile flags for SIMD files we use.
 list(LENGTH WEBP_SIMD_FILES_TO_INCLUDE WEBP_SIMD_FILES_TO_INCLUDE_LENGTH)
 math(EXPR WEBP_SIMD_FILES_TO_INCLUDE_RANGE
@@ -90,100 +167,176 @@ math(EXPR WEBP_SIMD_FILES_TO_INCLUDE_RANGE
 foreach(I_FILE RANGE ${WEBP_SIMD_FILES_TO_INCLUDE_RANGE})
  list(GET WEBP_SIMD_FILES_TO_INCLUDE ${I_FILE} FILE)
  list(GET WEBP_SIMD_FLAGS_TO_INCLUDE ${I_FILE} SIMD_COMPILE_FLAG)
-  set_source_files_properties(${FILE} PROPERTIES
-    COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
-  )
+  if(NOT "${SIMD_COMPILE_FLAG}" STREQUAL "NOTFOUND")  # quoted: safe if empty
+    set_source_files_properties(${FILE} PROPERTIES
+      COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
+    )
+  endif()
 endforeach()

 # Build the executables if asked for.
 if(WEBP_BUILD_CWEBP OR WEBP_BUILD_DWEBP OR
-   WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
+   WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP OR WEBP_BUILD_WEBPINFO OR WEBP_BUILD_WEBP_JS)
  # Example utility library.
-  set(exampleutil_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/example_util.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/example_util.h)
-  add_library(exampleutil ${exampleutil_SRCS})
-  target_link_libraries(exampleutil webp ${WEBP_DEP_LIBRARIES})
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "EXAMPLEUTIL_SRCS"
+    "example_util_[^ ]*")
+  list(APPEND EXAMPLEUTIL_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
+  add_library(exampleutil ${EXAMPLEUTIL_SRCS})

-  set(imageioutil_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/imageio_util.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/imageio_util.h)
-  add_library(imageioutil ${imageioutil_SRCS})
-  target_link_libraries(imageioutil ${WEBP_DEP_LIBRARIES})
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEIOUTILS_SRCS"
+    "imageio_util_[^ ]*")
+  add_library(imageioutil ${IMAGEIOUTILS_SRCS})
+  target_link_libraries(imageioutil webp)

  # Image-decoding utility library.
-  set(imagedec_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/gifdec.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/gifdec.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_dec.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_dec.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/jpegdec.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/jpegdec.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/metadata.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/metadata.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/pngdec.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/pngdec.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/tiffdec.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/tiffdec.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/webpdec.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/webpdec.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/wicdec.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/wicdec.h)
-  add_library(imagedec ${imagedec_SRCS})
-  target_link_libraries(imagedec webp ${WEBP_DEP_LIBRARIES}
-    ${WEBP_DEP_IMG_LIBRARIES})
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEDEC_SRCS"
+    "imagedec_[^ ]*")
+  add_library(imagedec ${IMAGEDEC_SRCS})
+  target_link_libraries(imagedec imageioutil webp ${WEBP_DEP_IMG_LIBRARIES})

  # Image-encoding utility library.
-  set(imageenc_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_enc.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_enc.h)
-  add_library(imageenc ${imageenc_SRCS})
-  target_link_libraries(imageenc webp imageioutil
-    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES})
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEENC_SRCS"
+    "imageenc_[^ ]*")
+  add_library(imageenc ${IMAGEENC_SRCS})
+  target_link_libraries(imageenc webp)
 endif()

 if(WEBP_BUILD_DWEBP)
  # dwebp
  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  add_executable(dwebp
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/dwebp.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
-  target_link_libraries(dwebp imagedec imageenc webp
-    exampleutil imageioutil
-    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
-  )
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "DWEBP_SRCS"
+    "dwebp")
+  add_executable(dwebp ${DWEBP_SRCS})
+  target_link_libraries(dwebp exampleutil imagedec imageenc webpdecoder)
+  install(TARGETS dwebp RUNTIME DESTINATION bin)
 endif()

 if(WEBP_BUILD_CWEBP)
  # cwebp
  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  add_executable(cwebp
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/cwebp.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
-  target_link_libraries(cwebp imagedec webp exampleutil imageioutil
-    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
-  )
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "CWEBP_SRCS"
+    "cwebp")
+  add_executable(cwebp ${CWEBP_SRCS})
+  target_link_libraries(cwebp exampleutil imagedec webp)
+  install(TARGETS cwebp RUNTIME DESTINATION bin)
+endif()
+
+if(WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/mux "WEBP_MUX_SRCS"
+    "")
+  add_library(webpmux ${WEBP_MUX_SRCS})
+  target_link_libraries(webpmux webp)
+  parse_version(mux/Makefile.am webpmux WEBP_MUX_SOVERSION)
+  set_target_properties(webpmux PROPERTIES VERSION ${WEBP_VERSION}
+    SOVERSION ${WEBP_MUX_SOVERSION})
+  list(APPEND INSTALLED_LIBRARIES webpmux)
 endif()

 if(WEBP_BUILD_GIF2WEBP)
  # gif2webp
-  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  set(GIF2WEBP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/examples/gif2webp.c)
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/mux "GIF2WEBP_SRCS")
+  include_directories(${WEBP_DEP_GIF_INCLUDE_DIRS})
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "GIF2WEBP_SRCS"
+    "gif2webp")
  add_executable(gif2webp ${GIF2WEBP_SRCS})
-  target_link_libraries(gif2webp imagedec webp exampleutil imageioutil
-    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
-  )
+  target_link_libraries(gif2webp exampleutil imageioutil webp webpmux
+    ${WEBP_DEP_GIF_LIBRARIES})
+  install(TARGETS gif2webp RUNTIME DESTINATION bin)
 endif()

 if(WEBP_BUILD_IMG2WEBP)
  # img2webp
  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  set(IMG2WEBP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/examples/img2webp.c)
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/mux "IMG2WEBP_SRCS")
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "IMG2WEBP_SRCS"
+    "img2webp")
  add_executable(img2webp ${IMG2WEBP_SRCS})
-  target_link_libraries(img2webp imagedec webp exampleutil imageioutil
-    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
-  )
+  target_link_libraries(img2webp exampleutil imagedec imageioutil webp webpmux)
+  install(TARGETS img2webp RUNTIME DESTINATION bin)
 endif()
+
+if (WEBP_BUILD_WEBPINFO)
+  # webpinfo
+  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "WEBPINFO_SRCS"
+    "webpinfo")
+  add_executable(webpinfo ${WEBPINFO_SRCS})
+  target_link_libraries(webpinfo exampleutil imageioutil)
+  install(TARGETS webpinfo RUNTIME DESTINATION bin)
+endif()
+
+if(WEBP_BUILD_WEBP_JS)
+  # JavaScript version
+  add_executable(webp_js
+                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
+  target_link_libraries(webp_js webpdecoder SDL)
+  set_target_properties(webp_js PROPERTIES LINK_FLAGS
+      "-s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
+  set_target_properties(webp_js PROPERTIES OUTPUT_NAME webp)
+  target_compile_definitions(webp_js PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)
+
+  # WASM version
+  add_executable(webp_wasm
+                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
+  target_link_libraries(webp_wasm webpdecoder SDL)
+  set_target_properties(webp_wasm PROPERTIES LINK_FLAGS
+      "-s WASM=1 -s 'BINARYEN_METHOD=\"native-wasm\"' \
+      -s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
+  target_compile_definitions(webp_wasm PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)
+
+  target_compile_definitions(webpdecoder PUBLIC EMSCRIPTEN)
+endif()
+
+# Install the different headers and libraries.
+install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/demux.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/encode.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h
+              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h
+        DESTINATION include/webp)
+install(TARGETS ${INSTALLED_LIBRARIES}
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
+
+# Create the CMake version file.
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  "${CMAKE_CURRENT_BINARY_DIR}/WebPConfigVersion.cmake"
+  VERSION ${WEBP_VERSION}
+  COMPATIBILITY AnyNewerVersion
+)
+
+# Create the Config file.
+include(CMakePackageConfigHelpers)
+set(ConfigPackageLocation share/WebP/cmake/)
+configure_package_config_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/WebPConfig.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/WebPConfig.cmake
+  INSTALL_DESTINATION ${ConfigPackageLocation}
+)
+
+# Install the generated CMake files.
+install(
+  FILES "${CMAKE_CURRENT_BINARY_DIR}/WebPConfigVersion.cmake"
+        "${CMAKE_CURRENT_BINARY_DIR}/WebPConfig.cmake"
+  DESTINATION ${ConfigPackageLocation}
+)
+
+# Install the man pages.
+set(MAN_PAGES cwebp.1 dwebp.1 gif2webp.1 img2webp.1 vwebp.1 webpmux.1
+  webpinfo.1)
+set(EXEC_BUILDS "CWEBP" "DWEBP" "GIF2WEBP" "IMG2WEBP" "VWEBP" "WEBPMUX"
+  "WEBPINFO")
+list(LENGTH MAN_PAGES MAN_PAGES_LENGTH)
+math(EXPR MAN_PAGES_RANGE "${MAN_PAGES_LENGTH} - 1")
+
+foreach(I_MAN RANGE ${MAN_PAGES_RANGE})
+  list(GET EXEC_BUILDS ${I_MAN} EXEC_BUILD)
+  if(WEBP_BUILD_${EXEC_BUILD})  # only install pages for tools being built
+    list(GET MAN_PAGES ${I_MAN} MAN_PAGE)
+    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/man/${MAN_PAGE}
+      DESTINATION share/man/man1  # relative: resolved against the prefix
+      COMPONENT doc
+    )
+  endif()
+endforeach()
--- a/2
+++ b/2
@@ -1,3 +1,5 @@
+20a7fea0 extras/Makefile.am: fix libwebpextras.la reference
+415f3ffe update ChangeLog (tag: v0.6.0-rc3)
 3c6d1224 update NEWS
 ee4a4141 update AUTHORS
 32ed856f Fix "all|no frames are keyframes" settings.
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -229,6 +229,7 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\yuv.obj \
    $(DIROBJ)\dsp\yuv_mips32.obj \
    $(DIROBJ)\dsp\yuv_mips_dsp_r2.obj \
+    $(DIROBJ)\dsp\yuv_neon.obj \
    $(DIROBJ)\dsp\yuv_sse2.obj \

 DSP_ENC_OBJS = \
@@ -254,6 +255,8 @@ DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\lossless_enc_neon.obj \
    $(DIROBJ)\dsp\lossless_enc_sse2.obj \
    $(DIROBJ)\dsp\lossless_enc_sse41.obj \
+    $(DIROBJ)\dsp\ssim.obj \
+    $(DIROBJ)\dsp\ssim_sse2.obj \

 EX_ANIM_UTIL_OBJS = \
    $(DIROBJ)\examples\anim_util.obj \
@@ -263,6 +266,7 @@ IMAGEIO_DEC_OBJS = \
    $(DIROBJ)\imageio\jpegdec.obj \
    $(DIROBJ)\imageio\metadata.obj \
    $(DIROBJ)\imageio\pngdec.obj \
+    $(DIROBJ)\imageio\pnmdec.obj \
    $(DIROBJ)\imageio\tiffdec.obj \
    $(DIROBJ)\imageio\webpdec.obj \
    $(DIROBJ)\imageio\wicdec.obj \
@@ -279,6 +283,7 @@ EX_UTIL_OBJS = \
 ENC_OBJS = \
    $(DIROBJ)\enc\alpha_enc.obj \
    $(DIROBJ)\enc\analysis_enc.obj \
+    $(DIROBJ)\enc\backward_references_cost_enc.obj \
    $(DIROBJ)\enc\backward_references_enc.obj \
    $(DIROBJ)\enc\config_enc.obj \
    $(DIROBJ)\enc\cost_enc.obj \
@@ -344,7 +349,8 @@ all: ex
 OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
 EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe \
                 $(DIRBIN)\img2webp.exe $(DIRBIN)\get_disto.exe \
-                 $(DIRBIN)\webp_quality.exe
+                 $(DIRBIN)\webp_quality.exe $(DIRBIN)\vwebp_sdl.exe \
+                 $(DIRBIN)\webpinfo.exe

 ex: $(OUT_LIBS) $(OUT_EXAMPLES)
 all: ex $(EXTRA_EXAMPLES)
@@ -366,6 +372,9 @@ $(DIRBIN)\gif2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBPMUX)
 $(DIRBIN)\gif2webp.exe: $(LIBWEBP)
 $(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj $(EX_UTIL_OBJS)
 $(DIRBIN)\vwebp.exe: $(IMAGEIO_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
+$(DIRBIN)\vwebp_sdl.exe: $(DIROBJ)\extras\vwebp_sdl.obj
+$(DIRBIN)\vwebp_sdl.exe: $(DIROBJ)\extras\webp_to_sdl.obj
+$(DIRBIN)\vwebp_sdl.exe: $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
 $(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
 $(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
 $(DIRBIN)\img2webp.exe: $(DIROBJ)\examples\img2webp.obj $(LIBWEBPMUX)
@@ -373,10 +382,12 @@ $(DIRBIN)\img2webp.exe: $(IMAGEIO_DEC_OBJS)
 $(DIRBIN)\img2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
 $(DIRBIN)\get_disto.exe: $(DIROBJ)\extras\get_disto.obj
 $(DIRBIN)\get_disto.exe: $(IMAGEIO_DEC_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
-
 $(DIRBIN)\webp_quality.exe: $(DIROBJ)\extras\webp_quality.obj
 $(DIRBIN)\webp_quality.exe: $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\webp_quality.exe: $(EXTRAS_OBJS) $(LIBWEBP)
+$(DIRBIN)\webpinfo.exe: $(DIROBJ)\examples\webpinfo.obj
+$(DIRBIN)\webpinfo.exe: $(IMAGEIO_DEC_OBJS)
+$(DIRBIN)\webpinfo.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)

 $(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
 $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS): $(OUTPUT_DIRS)
--- a/15
+++ b/15
@@ -113,8 +113,8 @@ make install

 CMake:
 ------
-The support for CMake is minimal: it only helps you compile libwebp, cwebp and
-dwebp.
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp and the
+JS bindings.

 Prerequisites:
 A compiler (e.g., gcc with autotools) and CMake.
@@ -123,18 +123,25 @@ minimal build:
 $ sudo apt-get install build-essential cmake

 When building from git sources, you will need to run cmake to generate the
-configure script.
+makefiles.

 mkdir build && cd build && cmake ../
 make
 make install

-If you also want cwebp or dwebp, you will need to enable them through CMake:
+If you also want any of the executables, you will need to enable them through
+CMake, e.g.:

 cmake -DWEBP_BUILD_CWEBP=ON -DWEBP_BUILD_DWEBP=ON ../

 or through your favorite interface (like ccmake or cmake-qt-gui).

+Finally, once installed, you can also use WebP in your CMake project by doing:
+
+find_package(WebP)
+
+which will define the CMake variables WebP_INCLUDE_DIRS and WebP_LIBRARIES.
+
 Gradle:
 -------
 The support for Gradle is minimal: it only helps you compile libwebp, cwebp and
--- a/README.wasm
+++ b/README.wasm
@@ -0,0 +1,91 @@
+Description:
+============
+
+This file describes the compilation of libwebp using portable intrinsics /
+WebAssembly (wasm) to native targets using clang and CMake.
+
+Prerequisites:
+==============
+
+- cmake 2.8+
+
+- clang 3.9+ for portable intrinsics support; as wasm progresses a tip of tree
+  build may be necessary.
+
+Building:
+=========
+
+ - configure the project with CMake using:
+
+ $ mkdir -p build && \
+   cd build && \
+   cmake -DWEBP_BUILD_DWEBP=1 -DCMAKE_C_COMPILER=clang -DWEBP_ENABLE_WASM=1 ../
+
+ - compile dwebp using 'make'.
+
+ - Note this currently generates native executables only and is incompatible
+   with -DWEBP_BUILD_WEBP_JS.
+
+Build options:
+==============
+
+- platform specific multiply high (mulhi) implementation, disabled by default.
+  arm: -DCMAKE_C_FLAGS='-DENABLE_NEON_BUILTIN_MULHI_INT16X8 ...'
+  x86: -DCMAKE_C_FLAGS='-DENABLE_X86_BUILTIN_MULHI_INT16X8 ...'
+
+Cross compilation:
+==================
+
+ - arm toolchains can be obtained from:
+   http://www.linaro.org/downloads/
+
+ - the android ndk can be obtained from:
+   https://developer.android.com/ndk/downloads/index.html
+
+armv7:
+------
+
+Android:
+ $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
+   --arch arm --api 24 --stl gnustl --install-dir /opt/android-arm-24
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp \
+   -DWEBP_BUILD_DWEBP=1 \
+   -DCMAKE_C_COMPILER=/opt/android-arm-24/bin/clang \
+   -DCMAKE_PREFIX_PATH=/opt/android-arm-24/sysroot/usr/lib \
+   -DCMAKE_C_FLAGS=-fPIE \
+   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
+   -DCMAKE_BUILD_TYPE=Release \
+   -DWEBP_ENABLE_WASM=1
+
+Linux:
+ $ gcc_arm=/opt/gcc-arm; target=arm-linux-gnueabihf
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
+   -DCMAKE_C_COMPILER=clang \
+   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc -march=armv7-a -mfpu=neon" \
+   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
+
+aarch64 / arm64:
+----------------
+
+Android:
+ $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
+   --arch arm64 --api 24 --stl gnustl --install-dir /opt/android-arm64-24
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp \
+   -DWEBP_BUILD_DWEBP=1 \
+   -DCMAKE_C_COMPILER=/opt/android-arm64-24/bin/clang \
+   -DCMAKE_PREFIX_PATH=/opt/android-arm64-24/sysroot/usr/lib \
+   -DCMAKE_C_FLAGS=-fPIE \
+   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
+   -DCMAKE_BUILD_TYPE=Release \
+   -DWEBP_ENABLE_WASM=1
+
+Linux:
+ $ gcc_arm=/opt/gcc-aarch64; target=aarch64-linux-gnu
+ $ mkdir -p build && cd build
+ $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
+   -DCMAKE_C_COMPILER=clang \
+   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc" \
+   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
--- a/README.webp_js
+++ b/README.webp_js
@@ -0,0 +1,80 @@
+     __   __ ____ ____ ____     __  ____
+    /  \\/  \  _ \  _ \  _ \   (__)/  __\
+    \       /  __/ _  \  __/   _)  \_   \
+     \__\__/_____/____/_/     /____/____/
+
+Description:
+============
+
+This file describes the compilation of libwebp into a JavaScript decoder
+using Emscripten and CMake.
+
+ - install the Emscripten SDK following the procedure described at:
+   https://kripken.github.io/emscripten-site/docs/getting_started/downloads.html
+   After installation, some environment variables should be set to point to
+   the location of the SDK. In particular, $EMSCRIPTEN should point to the
+   top-level directory containing Emscripten tools.
+
+ - make sure the file $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake is
+   accessible. This is the toolchain file used by CMake to invoke Emscripten.
+
+ - configure the project 'WEBP_JS' with CMake using:
+
+ cd webp_js && \
+ cmake -DWEBP_BUILD_WEBP_JS=ON \
+       -DEMSCRIPTEN_GENERATE_BITCODE_STATIC_LIBRARIES=1 \
+       -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \
+       ../
+
+ - compile webp.js using 'make'.
+
+ - that's it! Upon completion, you should have the webp.js and
+   webp.js.mem files generated.
+
+ - Note this generates both webp_js and webp_wasm without any SIMD enabled due
+   to bugs with this toolchain associated with the SSE2 code.
+   -DWEBP_ENABLE_WASM is currently meant to generate native (x86, arm)
+   executables (dwebp, cwebp) and is incompatible with -DWEBP_BUILD_WEBP_JS.
+
+The callable JavaScript function is WebpToSDL(), which decodes a raw WebP
+bitstream into a canvas. See webp_js/index.html for a simple usage sample.
+
+Demo HTML page:
+===============
+
+   The HTML page webp_js/index.html requires an HTTP server to serve the WebP
+   image example. It's easy to just use Python for that.
+
+cd webp_js && python -m SimpleHTTPServer 8080
+
+and then navigate to http://localhost:8080 in your favorite browser.
+
+
+Web-Assembly (WASM) version:
+============================
+
+  CMakeLists.txt is configured to build the WASM version when using
+  the option WEBP_BUILD_WEBP_JS=ON. The compilation step will assemble
+  the files 'webp_wasm.js', 'webp_wasm.wasm' in the webp_js/ directory.
+  See webp_js/index_wasm.html for a simple demo page using the WASM version
+  of the library.
+
+  You will need a fairly recent version of Emscripten (at least 1.37.8) and of
+  your WASM-enabled browser to run this version. Consider it very experimental!
+
+Caveat:
+=======
+
+  - First decoding using the library is usually slower, due to just-in-time
+    compilation.
+
+  - Some versions of llvm produce the following compile error when SSE2 is
+    enabled.
+
+"Unsupported:   %516 = bitcast <8 x i16> %481 to i128
+ LLVM ERROR: BitCast Instruction not yet supported for integer types larger than 64 bits"
+
+    The corresponding Emscripten bug is at:
+    https://github.com/kripken/emscripten/issues/3788
+
+    Therefore, SSE2 optimization is currently disabled in CMakeLists.txt.
--- a/build.gradle
+++ b/build.gradle
@@ -74,9 +74,17 @@ model {
          cCompiler.args "-frename-registers -s"
        }
      }
+      // mips32 fails to build with clang from r14b
+      // https://bugs.chromium.org/p/webp/issues/detail?id=343
+      if (toolChain in Clang) {
+        if (getTargetPlatform() == "mips") {
+          cCompiler.args "-no-integrated-as"
+        }
+      }
      // Check for NEON usage.
      if (getTargetPlatform() == "arm" || getTargetPlatform() == "arm64") {
        NEON = "c.neon"
+        cCompiler.define "HAVE_CPU_FEATURES_H"
      } else {
        NEON = "c"
      }
@@ -148,6 +156,7 @@ model {
            include "yuv.c"
            include "yuv_mips32.c"
            include "yuv_mips_dsp_r2.c"
+            include "yuv_neon.$NEON"
            include "yuv_sse2.c"
            srcDir "src/utils"
            include "bit_reader_utils.c"
@@ -179,9 +188,12 @@ model {
            include "lossless_enc_neon.$NEON"
            include "lossless_enc_sse2.c"
            include "lossless_enc_sse41.c"
+            include "ssim.c"
+            include "ssim_sse2.c"
            srcDir "src/enc"
            include "alpha_enc.c"
            include "analysis_enc.c"
+            include "backward_references_cost_enc.c"
            include "backward_references_enc.c"
            include "config_enc.c"
            include "cost_enc.c"
@@ -288,6 +300,7 @@ model {
            include "jpegdec.c"
            include "metadata.c"
            include "pngdec.c"
+            include "pnmdec.c"
            include "tiffdec.c"
            include "webpdec.c"
          }
@@ -389,6 +402,24 @@ model {
        }
      }
    }
+
+    webpinfo_example(NativeExecutableSpec) {
+      binaries {
+        all {
+          lib library: "example_util", linkage: "static"
+          lib library: "imageio_util", linkage: "static"
+          lib library: "webp"
+        }
+      }
+      sources {
+        c {
+          source {
+            srcDir "./examples"
+            include "webpinfo.c"
+          }
+        }
+      }
+    }
  }
  tasks {
    // Task to test all possible configurations.
--- a/cmake/WebPConfig.cmake.in
+++ b/cmake/WebPConfig.cmake.in
@@ -0,0 +1,6 @@
+@PACKAGE_INIT@
+# Headers are installed under <prefix>/include/webp; expose <prefix>/include.
+set(WebP_INCLUDE_DIRS "${PACKAGE_PREFIX_DIR}/include")
+set(WEBP_INCLUDE_DIRS ${WebP_INCLUDE_DIRS})
+set(WebP_LIBRARIES "@INSTALLED_LIBRARIES@")
+set(WEBP_LIBRARIES "${WebP_LIBRARIES}")
--- a/cmake/config.h.cmake
+++ b/cmake/config.h.cmake
@@ -65,7 +65,7 @@ endif()
 # Find the standard image libraries.
 set(WEBP_DEP_IMG_LIBRARIES)
 set(WEBP_DEP_IMG_INCLUDE_DIRS)
-foreach(I_LIB PNG JPEG TIFF GIF)
+foreach(I_LIB PNG JPEG TIFF)
  find_package(${I_LIB})
  set(WEBP_HAVE_${I_LIB} ${${I_LIB}_FOUND})
  if(${I_LIB}_FOUND)
@@ -74,6 +74,16 @@ foreach(I_LIB PNG JPEG TIFF GIF)
  endif()
 endforeach()

+# GIF detection, gifdec isn't part of the imageio lib.
+set(WEBP_DEP_GIF_LIBRARIES)
+set(WEBP_DEP_GIF_INCLUDE_DIRS)
+find_package(GIF)
+set(WEBP_HAVE_GIF ${GIF_FOUND})
+if(GIF_FOUND)
+  list(APPEND WEBP_DEP_GIF_LIBRARIES ${GIF_LIBRARIES})
+  list(APPEND WEBP_DEP_GIF_INCLUDE_DIRS ${GIF_INCLUDE_DIR})
+endif()
+
 ## Check for specific headers.
 include(CheckIncludeFiles)
 check_include_files("stdlib.h;stdarg.h;string.h;float.h" STDC_HEADERS)
--- a/cmake/cpu.cmake
+++ b/cmake/cpu.cmake
@@ -1,6 +1,11 @@
 ## Check for SIMD extensions.

-function(webp_check_compiler_flag WEBP_SIMD_FLAG)
+function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
+  if(NOT ENABLE_SIMD)
+    message(STATUS "Disabling ${WEBP_SIMD_FLAG} optimization.")
+    set(WEBP_HAVE_${WEBP_SIMD_FLAG} 0 PARENT_SCOPE)
+    return()
+  endif()
  unset(WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG} CACHE)
  check_c_source_compiles("
      #include \"${CMAKE_CURRENT_LIST_DIR}/../src/dsp/dsp.h\"
@@ -56,11 +61,11 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
  # (especially on Android).
  unset(WEBP_HAVE_${WEBP_SIMD_FLAG} CACHE)
  set(CMAKE_REQUIRED_FLAGS)
-  webp_check_compiler_flag(${WEBP_SIMD_FLAG})
+  webp_check_compiler_flag(${WEBP_SIMD_FLAG} ${WEBP_ENABLE_SIMD})
  if(NOT WEBP_HAVE_${WEBP_SIMD_FLAG})
    list(GET SIMD_ENABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
    set(CMAKE_REQUIRED_FLAGS ${SIMD_COMPILE_FLAG})
-    webp_check_compiler_flag(${WEBP_SIMD_FLAG})
+    webp_check_compiler_flag(${WEBP_SIMD_FLAG} ${WEBP_ENABLE_SIMD})
  else()
    set(SIMD_COMPILE_FLAG " ")
  endif()
@@ -80,8 +85,11 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
    foreach(FILE ${SIMD_FILES})
      list(APPEND WEBP_SIMD_FILES_NOT_TO_INCLUDE ${FILE})
    endforeach()
-    # Explicitly disable SIMD.
-    if(SIMD_DISABLE_FLAGS)
+    # Explicitly disable SIMD. Avoid this with WASM to avoid an ICE with clang:
+    # https://bugs.chromium.org/p/webp/issues/detail?id=350
+    # WASM overrides the native SIMD so building it in is harmless aside from
+    # binary size.
+    if(NOT WEBP_ENABLE_WASM AND SIMD_DISABLE_FLAGS)
      list(GET SIMD_DISABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
      include(CheckCCompilerFlag)
      if(SIMD_COMPILE_FLAG)
@@ -111,3 +119,13 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
    endif()
  endif()
 endforeach()
+
+## Add *_wasm.c files if enabled.
+if(WEBP_ENABLE_WASM)
+  # NOTE(review): two separate glob expressions are passed here (a directory
+  # and a relative pattern), not one joined path -- confirm the relative
+  # pattern resolves against the intended directory.
+  file(GLOB SIMD_FILES "${CMAKE_CURRENT_LIST_DIR}/../" "src/dsp/*_wasm.c")
+  # Every matched file is added to the list of SIMD sources to build.
+  list(APPEND WEBP_SIMD_FILES_TO_INCLUDE ${SIMD_FILES})
+endif()
--- a/configure.ac
+++ b/configure.ac
@@ -67,6 +67,7 @@ AC_DEFUN([TEST_AND_ADD_CFLAGS],
          CFLAGS="$SAVED_CFLAGS"])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-fvisibility=hidden])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wconstant-conversion])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wfloat-conversion])
@@ -75,6 +76,7 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-security])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wparentheses-equality])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshorten-64-to-32])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code])
@@ -241,9 +243,13 @@ AS_IF([test "x$enable_neon" != "xno"], [
          NEON_FLAGS=""],
          [AC_DEFINE(WEBP_HAVE_NEON_RTCD, [1],
                     [Set to 1 if runtime detection of NEON is enabled])])])
-        ;;
-    esac
-    AC_SUBST([NEON_FLAGS])])
+
+      case "$host_os" in
+        *android*) AC_CHECK_HEADERS([cpu-features.h]) ;;
+      esac
+      ;;
+  esac
+  AC_SUBST([NEON_FLAGS])])

 dnl === CLEAR_LIBVARS([var_pfx])
 dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
@@ -428,6 +434,44 @@ AS_IF([test "x$enable_gl" != "xno"], [
 ])
 AM_CONDITIONAL([BUILD_VWEBP], [test "$build_vwebp" = "yes"])

+dnl === check for SDL support ===
+
+AC_ARG_ENABLE([sdl],
+              AS_HELP_STRING([--disable-sdl],
+                             [Disable detection of SDL support
+                              @<:@default=auto@:>@]))
+AS_IF([test "x$enable_sdl" != "xno"], [
+  CLEAR_LIBVARS([SDL])
+  WITHLIB_OPTION([sdl], [SDL])
+  dnl sdl_header stays at no when neither SDL/SDL.h nor SDL.h is found.
+  sdl_header="no"
+  LIBCHECK_PROLOGUE([SDL])
+  AC_CHECK_HEADER([SDL/SDL.h], [sdl_header="SDL_SDL.h"],
+                  [AC_CHECK_HEADER([SDL.h], [sdl_header="SDL.h"],
+                  [AC_MSG_WARN(SDL library not available - no sdl.h)])])
+  if test x"$sdl_header" != "xno" ; then
+    AC_CHECK_LIB(SDL, SDL_Init,
+                 [SDL_LIBS="-lSDL"
+                  SDL_INCLUDES="-DWEBP_HAVE_SDL"
+                  AC_DEFINE(WEBP_HAVE_SDL, [1],
+                            [Set to 1 if SDL library is installed])
+                  sdl_support=yes
+                 ],
+                 AC_MSG_WARN(Optional SDL library not found),
+                 [$MATH_LIBS])
+    if test x"$sdl_header" = "xSDL.h" ; then
+      SDL_INCLUDES="$SDL_INCLUDES -DWEBP_HAVE_JUST_SDL_H"
+    fi
+  fi
+  LIBCHECK_EPILOGUE([SDL])
+  dnl vwebp_sdl is only built when the SDL library check succeeded.
+  if test "$sdl_support" = "yes" ; then
+    build_vwebp_sdl=yes
+  fi
+])
+
+AM_CONDITIONAL([BUILD_VWEBP_SDL], [test "$build_vwebp_sdl" = "yes"])
+
 dnl === check for PNG support ===

 AC_ARG_ENABLE([png], AS_HELP_STRING([--disable-png],
@@ -561,6 +605,11 @@ if test "$enable_libwebpmux" = "yes"; then
 fi
 AM_CONDITIONAL([BUILD_IMG2WEBP], [test "${build_img2webp}" = "yes"])

+if test "$enable_libwebpmux" = "yes"; then
+  build_webpinfo=yes
+fi
+AM_CONDITIONAL([BUILD_WEBPINFO], [test "${build_webpinfo}" = "yes"])
+
 dnl === check for WIC support ===

 AC_ARG_ENABLE([wic],
@@ -718,4 +767,7 @@ gif2webp    : ${build_gif2webp-no}
 img2webp    : ${build_img2webp-no}
 webpmux     : ${enable_libwebpmux-no}
 vwebp       : ${build_vwebp-no}
+webpinfo    : ${build_webpinfo-no}
+SDL support : ${sdl_support-no}
+vwebp_sdl   : ${build_vwebp_sdl-no}
 ])
--- a/examples/Android.mk
+++ b/examples/Android.mk
@@ -80,3 +80,19 @@ LOCAL_STATIC_LIBRARIES := example_util imageio_util imagedec webpmux webp
 LOCAL_MODULE := img2webp_example

 include $(BUILD_EXECUTABLE)
+
+################################################################################
+# webpinfo
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    webpinfo.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
+LOCAL_STATIC_LIBRARIES := example_util imageio_util webp
+
+LOCAL_MODULE := webpinfo_example
+
+include $(BUILD_EXECUTABLE)
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -16,6 +16,9 @@ endif
 if BUILD_VWEBP
  bin_PROGRAMS += vwebp
 endif
+if BUILD_WEBPINFO
+  bin_PROGRAMS += webpinfo
+endif

 noinst_LTLIBRARIES = libexample_util.la

@@ -66,6 +69,11 @@ img2webp_LDADD += ../imageio/libimagedec.la
 img2webp_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la
 img2webp_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

+webpinfo_SOURCES = webpinfo.c
+webpinfo_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+webpinfo_LDADD  = libexample_util.la ../imageio/libimageio_util.la
+webpinfo_LDADD += ../src/libwebp.la
+
 if BUILD_LIBWEBPDECODER
  anim_diff_LDADD += ../src/libwebpdecoder.la
  vwebp_LDADD += ../src/libwebpdecoder.la
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@@ -332,9 +332,8 @@ int main(int argc, const char *argv[]) {
      case BMP:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
        break;
-      case TIFF:    // note: force pre-multiplied alpha
-        output_buffer->colorspace =
-            bitstream->has_alpha ? MODE_rgbA : MODE_RGB;
+      case TIFF:
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
        break;
      case PGM:
      case RAW_YUV:
--- a/examples/gifdec.c
+++ b/examples/gifdec.c
@@ -28,11 +28,17 @@
 #define GIF_DISPOSE_SHIFT     2

 // from utils/utils.h
+#ifdef __cplusplus
+extern "C" {
+#endif
 extern void WebPCopyPlane(const uint8_t* src, int src_stride,
                          uint8_t* dst, int dst_stride,
                          int width, int height);
 extern void WebPCopyPixels(const WebPPicture* const src,
                           WebPPicture* const dst);
+#ifdef __cplusplus
+}
+#endif

 void GIFGetBackgroundColor(const ColorMapObject* const color_map,
                           int bgcolor_index, int transparent_index,
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@@ -378,13 +378,13 @@ static void HandleDisplay(void) {
    }
  }
  glPopMatrix();
-  glFlush();
+  glutSwapBuffers();
 }

 static void StartDisplay(void) {
  const int width = kParams.canvas_width;
  const int height = kParams.canvas_height;
-  glutInitDisplayMode(GLUT_RGBA);
+  glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
  glutInitWindowSize(width, height);
  glutCreateWindow("WebP viewer");
  glutDisplayFunc(HandleDisplay);
--- a/examples/webpinfo.c
+++ b/examples/webpinfo.c
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@@ -11,10 +11,14 @@ libwebpextras_la_CPPFLAGS = $(AM_CPPFLAGS)
 libwebpextras_la_LDFLAGS = -lm
 libwebpextras_la_LIBADD = ../src/libwebp.la

-noinst_PROGRAMS = get_disto webp_quality
+noinst_PROGRAMS =
+noinst_PROGRAMS += get_disto webp_quality
+if BUILD_VWEBP_SDL
+  noinst_PROGRAMS += vwebp_sdl
+endif

-get_disto_SOURCES = get_disto.c
-get_disto_CPPFLAGS  = $(AM_CPPFLAGS)
+get_disto_SOURCES  = get_disto.c
+get_disto_CPPFLAGS = $(AM_CPPFLAGS)
 get_disto_LDADD = ../imageio/libimageio_util.la ../imageio/libimagedec.la
 get_disto_LDADD += ../src/libwebp.la
 get_disto_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)
@@ -22,5 +26,11 @@ get_disto_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)
 webp_quality_SOURCES  = webp_quality.c
 webp_quality_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 webp_quality_LDADD  = ../imageio/libimageio_util.la
-webp_quality_LDADD += ./libwebpextras.la
+webp_quality_LDADD += libwebpextras.la
 webp_quality_LDADD += ../src/libwebp.la
+
+vwebp_sdl_SOURCES  = vwebp_sdl.c webp_to_sdl.c webp_to_sdl.h
+vwebp_sdl_CPPFLAGS = $(AM_CPPFLAGS) $(SDL_INCLUDES)
+vwebp_sdl_LDADD = ../imageio/libimageio_util.la
+vwebp_sdl_LDADD += ../src/libwebp.la
+vwebp_sdl_LDADD += $(SDL_LIBS)
--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@@ -278,7 +278,7 @@ int main(int argc, const char *argv[]) {
    goto End;
  }
  size1 = ReadPicture(name1, &pic1, 1);
-  size2 = ReadPicture(name1, &pic2, 1);
+  size2 = ReadPicture(name2, &pic2, 1);
  if (size1 == 0 || size2 == 0) goto End;

  if (!keep_alpha) {
--- a/extras/vwebp_sdl.c
+++ b/extras/vwebp_sdl.c
@@ -0,0 +1,96 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Simple SDL-based WebP file viewer.
+// Does not support animation, just static images.
+//
+// Press 'q' to exit.
+//
+// Author: James Zern (jzern@google.com)
+
+#include <stdio.h>
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#if defined(WEBP_HAVE_SDL)
+
+#include "webp_to_sdl.h"
+#include "webp/decode.h"
+#include "../imageio/imageio_util.h"
+
+#if defined(WEBP_HAVE_JUST_SDL_H)
+#include <SDL.h>
+#else
+#include <SDL/SDL.h>
+#endif
+
+// Blocks until the 'q' key is released, or until SDL_WaitEvent() returns 0.
+// Every other event is ignored.
+static void ProcessEvents(void) {
+  SDL_Event event;
+  while (SDL_WaitEvent(&event)) {
+    const int key_released = (event.type == SDL_KEYUP);
+    if (key_released && event.key.keysym.sym == SDLK_q) return;
+  }
+}
+
+// Shows each file named in argv in turn. Returns 0 on success, 1 on failure.
+int main(int argc, char* argv[]) {
+  int c;
+  int ok = 0;
+  for (c = 1; c < argc; ++c) {
+    const char* const file = argv[c];
+    const uint8_t* webp = NULL;
+    size_t webp_size = 0;
+    ok = 0;   // a failure below must not inherit the previous file's success
+    if (!strcmp(file, "-h")) {
+      printf("Usage: %s [-h] image.webp [more_files.webp...]\n", argv[0]);
+      return 0;
+    }
+    if (!ImgIoUtilReadFile(file, &webp, &webp_size)) {
+      fprintf(stderr, "Error opening file: %s\n", file);
+      goto Error;
+    }
+    if (webp_size != (size_t)(int)webp_size) {
+      fprintf(stderr, "File too large.\n");
+      free((void*)webp);   // the buffer was read successfully: release it
+      goto Error;
+    }
+    ok = WebpToSDL((const char*)webp, (int)webp_size);
+    free((void*)webp);
+    if (!ok) {
+      fprintf(stderr, "Error decoding file %s\n", file);
+      goto Error;
+    }
+    ProcessEvents();   // keep the image on screen until 'q' is released
+  }
+  ok = 1;
+
+ Error:
+  SDL_Quit();
+  return ok ? 0 : 1;
+}
+
+#else  // !WEBP_HAVE_SDL
+
+int main(int argc, const char *argv[]) {
+  fprintf(stderr, "SDL support not enabled in %s.\n", argv[0]);
+  (void)argc;
+  return 0;
+}
+
+#endif
--- a/extras/webp_to_sdl.c
+++ b/extras/webp_to_sdl.c
@@ -0,0 +1,105 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Simple WebP-to-SDL wrapper. Useful for emscripten.
+//
+// Author: James Zern (jzern@google.com)
+
+#ifdef HAVE_CONFIG_H
+#include "webp/config.h"
+#endif
+
+#if defined(WEBP_HAVE_SDL)
+
+#include "webp_to_sdl.h"
+
+#include <stdio.h>
+#include "webp/decode.h"
+
+#if defined(WEBP_HAVE_JUST_SDL_H)
+#include <SDL.h>
+#else
+#include <SDL/SDL.h>
+#endif
+
+// Decodes the WebP bitstream 'data' (of 'data_size' bytes) into a 32-bit SDL
+// surface and blits it to a window sized to the image.
+// Returns 1 on success, 0 on failure.
+int WebpToSDL(const char* data, unsigned int data_size) {
+  int ok = 0;
+  VP8StatusCode status;
+  WebPDecoderConfig config;
+  WebPBitstreamFeatures* const input = &config.input;
+  WebPDecBuffer* const output = &config.output;
+  SDL_Surface* screen = NULL;
+  SDL_Surface* surface = NULL;
+
+  if (!WebPInitDecoderConfig(&config)) {
+    fprintf(stderr, "Library version mismatch!\n");
+    return 0;   // failure: callers treat a non-zero result as success
+  }
+
+  if (SDL_Init(SDL_INIT_VIDEO) != 0) {
+    fprintf(stderr, "Unable to initialize SDL video!\n");
+    return 0;
+  }
+
+  // Read the image dimensions before creating the window and the surface.
+  status = WebPGetFeatures((uint8_t*)data, (size_t)data_size, &config.input);
+  if (status != VP8_STATUS_OK) goto Error;
+
+  screen = SDL_SetVideoMode(input->width, input->height, 32, SDL_SWSURFACE);
+  if (screen == NULL) {
+    fprintf(stderr, "Unable to set video mode (32bpp %dx%d)!\n",
+            input->width, input->height);
+    goto Error;
+  }
+
+  surface = SDL_CreateRGBSurface(SDL_SWSURFACE,
+                                 input->width, input->height, 32,
+                                 0x000000ffu,   // R mask
+                                 0x0000ff00u,   // G mask
+                                 0x00ff0000u,   // B mask
+                                 0xff000000u);  // A mask
+
+  if (surface == NULL) {
+    fprintf(stderr, "Unable to create %dx%d RGBA surface!\n",
+            input->width, input->height);
+    goto Error;
+  }
+  if (SDL_MUSTLOCK(surface)) SDL_LockSurface(surface);
+
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+  output->colorspace = MODE_BGRA;
+#else
+  output->colorspace = MODE_RGBA;
+#endif
+  // Decode straight into the surface's pixel buffer (no intermediate copy).
+  output->width  = surface->w;
+  output->height = surface->h;
+  output->u.RGBA.rgba   = surface->pixels;
+  output->u.RGBA.stride = surface->pitch;
+  output->u.RGBA.size   = surface->pitch * surface->h;
+  output->is_external_memory = 1;
+
+  status = WebPDecode((const uint8_t*)data, (size_t)data_size, &config);
+  if (status != VP8_STATUS_OK) {
+    fprintf(stderr, "Error decoding image (%d)\n", status);
+    goto Error;
+  }
+
+  if (SDL_MUSTLOCK(surface)) SDL_UnlockSurface(surface);
+  if (SDL_BlitSurface(surface, NULL, screen, NULL) ||
+      SDL_Flip(screen)) {
+    goto Error;
+  }
+
+  ok = 1;
+
+ Error:
+  SDL_FreeSurface(surface);
+  SDL_FreeSurface(screen);
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+
+#endif  // WEBP_HAVE_SDL
--- a/extras/webp_to_sdl.h
+++ b/extras/webp_to_sdl.h
@@ -0,0 +1,22 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Simple WebP-to-SDL wrapper. Useful for emscripten.
+//
+// Author: James Zern (jzern@google.com)
+
+#ifndef WEBP_EXTRAS_WEBP_TO_SDL_H_
+#define WEBP_EXTRAS_WEBP_TO_SDL_H_
+
+// Exports the method WebpToSDL(const char* data, unsigned int data_size) which
+// decodes a WebP bitstream into an RGBA SDL surface.
+// Returns false on failure.
+extern int WebpToSDL(const char* data, unsigned int data_size);
+
+#endif  // WEBP_EXTRAS_WEBP_TO_SDL_H_
--- a/imageio/Android.mk
+++ b/imageio/Android.mk
@@ -25,6 +25,7 @@ LOCAL_SRC_FILES := \
    jpegdec.c \
    metadata.c \
    pngdec.c \
+    pnmdec.c \
    tiffdec.c \
    webpdec.c \

--- a/imageio/Makefile.am
+++ b/imageio/Makefile.am
@@ -11,6 +11,7 @@ libimagedec_la_SOURCES  = image_dec.c image_dec.h
 libimagedec_la_SOURCES += jpegdec.c jpegdec.h
 libimagedec_la_SOURCES += metadata.c metadata.h
 libimagedec_la_SOURCES += pngdec.c pngdec.h
+libimagedec_la_SOURCES += pnmdec.c pnmdec.h
 libimagedec_la_SOURCES += tiffdec.c tiffdec.h
 libimagedec_la_SOURCES += webpdec.c webpdec.h
 libimagedec_la_SOURCES += wicdec.c wicdec.h
--- a/imageio/image_dec.c
+++ b/imageio/image_dec.c
@@ -29,6 +29,10 @@ WebPInputFileFormat WebPGuessImageType(const uint8_t* const data,
      format = WEBP_TIFF_FORMAT;
    } else if (magic1 == 0x52494646 && magic2 == 0x57454250) {
      format = WEBP_WEBP_FORMAT;
+    } else if (((magic1 >> 24) & 0xff) == 'P') {
+      const int type = (magic1 >> 16) & 0xff;
+      // we only support 'P5 -> P7' for now.
+      if (type >= '5' && type <= '7') format = WEBP_PNM_FORMAT;
    }
  }
  return format;
@@ -51,6 +55,7 @@ WebPImageReader WebPGetImageReader(WebPInputFileFormat format) {
    case WEBP_JPEG_FORMAT: return ReadJPEG;
    case WEBP_TIFF_FORMAT: return ReadTIFF;
    case WEBP_WEBP_FORMAT: return ReadWebP;
+    case WEBP_PNM_FORMAT: return ReadPNM;
    default: return FailReader;
  }
 }
--- a/imageio/image_dec.h
+++ b/imageio/image_dec.h
@@ -23,6 +23,7 @@
 #include "./metadata.h"
 #include "./jpegdec.h"
 #include "./pngdec.h"
+#include "./pnmdec.h"
 #include "./tiffdec.h"
 #include "./webpdec.h"
 #include "./wicdec.h"
@@ -36,6 +37,7 @@ typedef enum {
  WEBP_JPEG_FORMAT,
  WEBP_TIFF_FORMAT,
  WEBP_WEBP_FORMAT,
+  WEBP_PNM_FORMAT,
  WEBP_UNSUPPORTED_FORMAT
 } WebPInputFileFormat;

--- a/imageio/image_enc.c
+++ b/imageio/image_enc.c
@@ -361,6 +361,8 @@ int WebPWriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
  const uint8_t* rgba = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const uint8_t bytes_per_px = has_alpha ? 4 : 3;
+  const uint8_t assoc_alpha =
+      WebPIsPremultipliedMode(buffer->colorspace) ? 1 : 2;
  // For non-alpha case, we omit tag 0x152 (ExtraSamples).
  const uint8_t num_ifd_entries = has_alpha ? NUM_IFD_ENTRIES
                                            : NUM_IFD_ENTRIES - 1;
@@ -388,7 +390,8 @@ int WebPWriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
    0x1c, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 154: PlanarConfiguration
    0x28, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    // 166: ResolutionUnit (inch)
-    0x52, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 178: ExtraSamples: rgbA
+    0x52, 0x01, 3, 0, 1, 0, 0, 0,
+        assoc_alpha, 0, 0, 0,                    // 178: ExtraSamples: rgbA/RGBA
    0, 0, 0, 0,                                  // 190: IFD terminator
    // EXTRA_DATA_OFFSET:
    8, 0, 8, 0, 8, 0, 8, 0,      // BitsPerSample
--- a/imageio/imageio_util.c
+++ b/imageio/imageio_util.c
@@ -112,7 +112,7 @@ int ImgIoUtilWriteFile(const char* const file_name,
  if (data == NULL) {
    return 0;
  }
-  out = to_stdout ? stdout : fopen(file_name, "wb");
+  out = to_stdout ? ImgIoUtilSetBinaryMode(stdout) : fopen(file_name, "wb");
  if (out == NULL) {
    fprintf(stderr, "Error! Cannot open output file '%s'\n", file_name);
    return 0;
--- a/imageio/pnmdec.c
+++ b/imageio/pnmdec.c
@@ -0,0 +1,252 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// (limited) PNM decoder
+
+#include "./pnmdec.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webp/encode.h"
+#include "./imageio_util.h"
+
+typedef enum {
+  WIDTH_FLAG      = 1 << 0,
+  HEIGHT_FLAG     = 1 << 1,
+  DEPTH_FLAG      = 1 << 2,
+  MAXVAL_FLAG     = 1 << 3,
+  TUPLE_FLAG      = 1 << 4,
+  ALL_NEEDED_FLAGS = 0x1f
+} PNMFlags;
+
+typedef struct {
+  const uint8_t* data;
+  size_t data_size;
+  int width, height;
+  int bytes_per_px;   // 1, 3, 4
+  int depth;
+  int max_value;
+  int type;           // 5, 6 or 7
+  int seen_flags;
+} PNMInfo;
+
+// -----------------------------------------------------------------------------
+// PNM decoding
+
+#define MAX_LINE_SIZE 1024
+static const size_t kMinPNMHeaderSize = 3;
+
+// Copies the next line of 'data', starting at offset 'off', into 'out'.
+// At most MAX_LINE_SIZE bytes are copied; the terminating '\n' (if reached)
+// is replaced by a NUL, so 'out' is always NUL-terminated.
+// Empty lines and lines starting with '#' are skipped as long as input
+// remains after them. '*out_size' receives the number of bytes stored in
+// 'out'. Returns the offset just past the bytes consumed.
+static size_t ReadLine(const uint8_t* const data, size_t off, size_t data_size,
+                       char out[MAX_LINE_SIZE + 1], size_t* const out_size) {
+  size_t i = 0;
+  *out_size = 0;
+ redo:
+  // Copy bytes until a newline, the line-size limit or the end of the data.
+  for (i = 0; i < MAX_LINE_SIZE && off < data_size; ++i) {
+    out[i] = data[off++];
+    if (out[i] == '\n') break;
+  }
+  // Only retry while there is input left; otherwise return what was read.
+  if (off < data_size) {
+    if (i == 0) goto redo;         // empty line
+    if (out[0] == '#') goto redo;  // skip comment
+  }
+  out[i] = 0;   // safety sentinel
+  *out_size = i;
+  return off;
+}
+
+static size_t FlagError(const char flag[]) {
+  fprintf(stderr, "PAM header error: flags '%s' already seen.\n", flag);
+  return 0;
+}
+
+// inspired from http://netpbm.sourceforge.net/doc/pam.html
+// Parses the PAM (P7) header lines of 'info->data' starting at offset 'off',
+// up to and including the ENDHDR line, and stores the fields in 'info'.
+// WIDTH, HEIGHT, DEPTH and MAXVAL are mandatory and may appear only once.
+// If no recognized TUPLTYPE is given, bytes_per_px is derived from DEPTH and
+// MAXVAL. Returns the offset following the header, or 0 on error.
+static size_t ReadPAMFields(PNMInfo* const info, size_t off) {
+  char out[MAX_LINE_SIZE + 1];
+  size_t out_size;
+  int tmp;
+  assert(info != NULL);
+  while (1) {
+    off = ReadLine(info->data, off, info->data_size, out, &out_size);
+    if (off == 0) return 0;
+    if (sscanf(out, "WIDTH %d", &tmp) == 1) {
+      if (info->seen_flags & WIDTH_FLAG) return FlagError("WIDTH");
+      info->seen_flags |= WIDTH_FLAG;
+      info->width = tmp;
+    } else if (sscanf(out, "HEIGHT %d", &tmp) == 1) {
+      if (info->seen_flags & HEIGHT_FLAG) return FlagError("HEIGHT");
+      info->seen_flags |= HEIGHT_FLAG;
+      info->height = tmp;
+    } else if (sscanf(out, "DEPTH %d", &tmp) == 1) {
+      if (info->seen_flags & DEPTH_FLAG) return FlagError("DEPTH");
+      info->seen_flags |= DEPTH_FLAG;
+      info->depth = tmp;
+    } else if (sscanf(out, "MAXVAL %d", &tmp) == 1) {
+      if (info->seen_flags & MAXVAL_FLAG) return FlagError("MAXVAL");
+      info->seen_flags |= MAXVAL_FLAG;
+      info->max_value = tmp;
+    } else if (!strcmp(out, "TUPLTYPE RGB_ALPHA")) {
+      info->bytes_per_px = 4;
+      info->seen_flags |= TUPLE_FLAG;
+    } else if (!strcmp(out, "TUPLTYPE RGB")) {
+      info->bytes_per_px = 3;
+      info->seen_flags |= TUPLE_FLAG;
+    } else if (!strcmp(out, "TUPLTYPE GRAYSCALE")) {
+      info->bytes_per_px = 1;
+      info->seen_flags |= TUPLE_FLAG;
+    } else if (!strcmp(out, "ENDHDR")) {
+      break;
+    } else {
+      static const char kEllipsis[] = " ...";
+      size_t i, len;
+      // Long entries are cut to 20 characters, the last ones being " ...".
+      // strcpy() is used instead of sprintf() so that the text is never
+      // interpreted as a format string.
+      if (out_size > 20) strcpy(out + 20 - strlen(kEllipsis), kEllipsis);
+      len = strlen(out);
+      for (i = 0; i < len; ++i) {
+        // isprint() needs a value representable as unsigned char: a plain
+        // (possibly negative) char is undefined behavior.
+        if (!isprint((unsigned char)out[i])) out[i] = ' ';
+      }
+      fprintf(stderr, "PAM header error: unrecognized entry [%s]\n", out);
+      return 0;
+    }
+  }
+  if (!(info->seen_flags & TUPLE_FLAG)) {
+    // No recognized TUPLTYPE: two bytes per sample when MAXVAL exceeds 255.
+    info->seen_flags |= TUPLE_FLAG;
+    info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
+  }
+  if (info->seen_flags != ALL_NEEDED_FLAGS) {
+    fprintf(stderr, "PAM: incomplete header.\n");
+    return 0;
+  }
+  return off;
+}
+
+static size_t ReadHeader(PNMInfo* const info) {
+  size_t off = 0;
+  char out[MAX_LINE_SIZE + 1];
+  size_t out_size;
+  if (info == NULL) return 0;
+  if (info->data == NULL || info->data_size < kMinPNMHeaderSize) return 0;
+
+  info->width = info->height = 0;
+  info->type = -1;
+  info->seen_flags = 0;
+  info->bytes_per_px = 0;
+  info->depth = 0;
+  info->max_value = 0;
+
+  off = ReadLine(info->data, off, info->data_size, out, &out_size);
+  if (off == 0 || sscanf(out, "P%d", &info->type) != 1) return 0;
+  if (info->type == 7) {
+    off = ReadPAMFields(info, off);
+  } else {
+    off = ReadLine(info->data, off, info->data_size, out, &out_size);
+    if (off == 0 || sscanf(out, "%d %d", &info->width, &info->height) != 2) {
+      return 0;
+    }
+    off = ReadLine(info->data, off, info->data_size, out, &out_size);
+    if (off == 0 || sscanf(out, "%d", &info->max_value) != 1) return 0;
+
+    // finish initializing missing fields
+    info->depth = (info->type == 5) ? 1 : 3;
+    info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
+  }
+  // perform some basic numerical validation
+  if (info->width <= 0 || info->height <= 0 ||
+      info->type <= 0 || info->type >= 9 ||
+      info->depth <= 0 || info->depth > 4 ||
+      info->bytes_per_px < info->depth ||
+      info->max_value <= 0 || info->max_value >= 65536) {
+    return 0;
+  }
+  return off;
+}
+
+// Decodes the PNM image (P5, P6 or P7) held in 'data' into 'pic'.
+// Only 8-bit gray, RGB and RGBA samples are supported; other layouts are
+// rejected. 'keep_alpha' and 'metadata' are unused: alpha is imported whenever
+// the image has four samples per pixel. Returns true on success.
+int ReadPNM(const uint8_t* const data, size_t data_size,
+            WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata) {
+  int ok = 0;
+  int i, j;
+  uint64_t stride, pixel_bytes;
+  uint8_t* rgb = NULL, *tmp_rgb;
+  size_t offset;
+  PNMInfo info;
+
+  info.data = data;
+  info.data_size = data_size;
+  offset = ReadHeader(&info);
+  if (offset == 0) {
+    fprintf(stderr, "Error parsing PNM header.\n");
+    goto End;
+  }
+
+  if (info.type < 5 || info.type > 7) {
+    fprintf(stderr, "Unsupported P%d PNM format.\n", info.type);
+    goto End;
+  }
+
+  // The conversion loop below only handles one byte per sample with 1, 3 or
+  // 4 samples per pixel. Anything else (16-bit samples, depth 2, a TUPLTYPE
+  // that disagrees with DEPTH) would be misread or would leave parts of
+  // 'rgb' uninitialized before it is imported.
+  if (info.max_value > 255 || info.depth == 2 ||
+      info.bytes_per_px != info.depth) {
+    fprintf(stderr, "Unsupported PNM layout (depth %d, maxval %d).\n",
+                    info.depth, info.max_value);
+    goto End;
+  }
+
+  // Some basic validations.
+  if (pic == NULL) goto End;
+  if (info.width > WEBP_MAX_DIMENSION || info.height > WEBP_MAX_DIMENSION) {
+    fprintf(stderr, "Invalid %dx%d dimension for PNM\n",
+                    info.width, info.height);
+    goto End;
+  }
+
+  // The whole pixel payload must be present after the header.
+  pixel_bytes = (uint64_t)info.width * info.height * info.bytes_per_px;
+  if (data_size < offset + pixel_bytes) {
+    fprintf(stderr, "Truncated PNM file (P%d).\n", info.type);
+    goto End;
+  }
+  // Grayscale input is expanded to RGB, hence at least 3 bytes per pixel.
+  stride =
+      (uint64_t)(info.bytes_per_px < 3 ? 3 : info.bytes_per_px) * info.width;
+  if (stride != (size_t)stride ||
+      !ImgIoUtilCheckSizeArgumentsOverflow(stride, info.height)) {
+    goto End;
+  }
+
+  rgb = (uint8_t*)malloc((size_t)stride * info.height);
+  if (rgb == NULL) goto End;
+
+  // Convert input
+  tmp_rgb = rgb;
+  for (j = 0; j < info.height; ++j) {
+    assert(offset + info.bytes_per_px * info.width <= data_size);
+    if (info.depth == 1) {
+      // convert grayscale -> RGB
+      for (i = 0; i < info.width; ++i) {
+        const uint8_t v = data[offset + i];
+        tmp_rgb[3 * i + 0] = tmp_rgb[3 * i + 1] = tmp_rgb[3 * i + 2] = v;
+      }
+    } else if (info.depth == 3) {   // RGB
+      memcpy(tmp_rgb, data + offset, 3 * info.width * sizeof(*data));
+    } else if (info.depth == 4) {   // RGBA
+      memcpy(tmp_rgb, data + offset, 4 * info.width * sizeof(*data));
+    }
+    offset += info.bytes_per_px * info.width;
+    tmp_rgb += stride;
+  }
+
+  // WebP conversion.
+  pic->width = info.width;
+  pic->height = info.height;
+  ok = (info.depth == 4) ? WebPPictureImportRGBA(pic, rgb, (int)stride)
+                         : WebPPictureImportRGB(pic, rgb, (int)stride);
+  if (!ok) goto End;
+
+  ok = 1;
+ End:
+  free((void*)rgb);
+
+  (void)metadata;
+  (void)keep_alpha;
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
--- a/imageio/pnmdec.h
+++ b/imageio/pnmdec.h
@@ -0,0 +1,37 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// partial PNM format decoder (ppm/pgm)
+
+#ifndef WEBP_IMAGEIO_PNMDEC_H_
+#define WEBP_IMAGEIO_PNMDEC_H_
+
+#include "webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct Metadata;
+struct WebPPicture;
+
+// Reads a PNM file from 'data', returning the decoded output in 'pic'.
+// The output is RGB or YUV depending on pic->use_argb value.
+// Returns true on success.
+// 'metadata' has no effect, but is kept for coherence with other signatures
+// for image readers.
+int ReadPNM(const uint8_t* const data, size_t data_size,
+            struct WebPPicture* const pic, int keep_alpha,
+            struct Metadata* const metadata);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_IMAGEIO_PNMDEC_H_
--- a/imageio/tiffdec.c
+++ b/imageio/tiffdec.c
@@ -15,6 +15,7 @@
 #include "webp/config.h"
 #endif

+#include <limits.h>
 #include <stdio.h>
 #include <string.h>

@@ -107,7 +108,7 @@ static void MyUnmapFile(thandle_t opaque, void* base, toff_t size) {
 static tsize_t MyRead(thandle_t opaque, void* dst, tsize_t size) {
  MyData* const my_data = (MyData*)opaque;
  if (my_data->pos + size > my_data->size) {
-    size = my_data->size - my_data->pos;
+    size = (tsize_t)(my_data->size - my_data->pos);
  }
  if (size > 0) {
    memcpy(dst, my_data->data + my_data->pos, size);
@@ -116,18 +117,55 @@ static tsize_t MyRead(thandle_t opaque, void* dst, tsize_t size) {
  return size;
 }

+// Unmultiply Argb data. Taken from dsp/alpha_processing
+// (we don't want to force a dependency to a libdspdec library).
+#define MFIX 24    // 24bit fixed-point arithmetic
+#define HALF ((1u << MFIX) >> 1)
+#define KINV_255 ((1u << MFIX) / 255u)
+
+// Returns 'x' scaled by the MFIX-bit fixed-point factor 'mult', rounded to
+// nearest and clamped to 255. The product is computed in 64 bits: when 'x' is
+// larger than the alpha 'mult' was derived from (malformed input), a 32-bit
+// product would wrap around and the clamp would not trigger.
+static uint32_t Unmult(uint8_t x, uint32_t mult) {
+  const uint64_t v = ((uint64_t)x * mult + HALF) >> MFIX;
+  return (v > 255u) ? 255u : (uint32_t)v;
+}
+
+static WEBP_INLINE uint32_t GetScale(uint32_t a) {
+  return (255u << MFIX) / a;
+}
+
+// Converts, in place, 'width' 4-byte pixels at 'ptr' from premultiplied to
+// non-premultiplied form: the first three bytes of each pixel are scaled by
+// 255/alpha, where alpha is the pixel's fourth byte. Despite its name, this
+// function un-multiplies; it does not multiply.
+static void MultARGBRow(uint8_t* ptr, int width) {
+  int x;
+  for (x = 0; x < width; ++x, ptr += 4) {
+    const uint32_t alpha = ptr[3];
+    // Pixels with alpha == 255 are left untouched.
+    if (alpha < 255) {
+      if (alpha == 0) {   // alpha == 0
+        // Zero the color bytes; this also avoids a division by zero below.
+        ptr[0] = ptr[1] = ptr[2] = 0;
+      } else {
+        const uint32_t scale = GetScale(alpha);
+        ptr[0] = Unmult(ptr[0], scale);
+        ptr[1] = Unmult(ptr[1], scale);
+        ptr[2] = Unmult(ptr[2], scale);
+      }
+    }
+  }
+}
+
 int ReadTIFF(const uint8_t* const data, size_t data_size,
             WebPPicture* const pic, int keep_alpha,
             Metadata* const metadata) {
  MyData my_data = { data, (toff_t)data_size, 0 };
  TIFF* tif;
-  uint32 width, height;
-  uint32* raster;
+  uint32_t width, height;
+  uint16_t samples_per_px = 0;
+  uint16_t extra_samples = 0;
+  uint16_t* extra_samples_ptr = NULL;
+  uint32_t* raster;
  int64_t alloc_size;
  int ok = 0;
  tdir_t dircount;

-  if (data == NULL || data_size == 0 || pic == NULL) return 0;
+  if (data == NULL || data_size == 0 || data_size > INT_MAX || pic == NULL) {
+    return 0;
+  }

  tif = TIFFClientOpen("Memory", "r", &my_data,
                       MyRead, MyRead, MySeek, MyClose,
@@ -143,17 +181,27 @@ int ReadTIFF(const uint8_t* const data, size_t data_size,
                    "Only the first will be used, %d will be ignored.\n",
                    dircount - 1);
  }
+  if (!TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &samples_per_px)) {
+    fprintf(stderr, "Error! Cannot retrieve TIFF samples-per-pixel info.\n");
+    goto End;
+  }
+  if (samples_per_px < 3 || samples_per_px > 4) goto End;  // not supported

  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) &&
        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height))) {
    fprintf(stderr, "Error! Cannot retrieve TIFF image dimensions.\n");
    goto End;
  }
-
  if (!ImgIoUtilCheckSizeArgumentsOverflow((uint64_t)width * height,
                                           sizeof(*raster))) {
    goto End;
  }
+  if (samples_per_px > 3 && !TIFFGetField(tif, TIFFTAG_EXTRASAMPLES,
+                                          &extra_samples, &extra_samples_ptr)) {
+    fprintf(stderr, "Error! Cannot retrieve TIFF ExtraSamples info.\n");
+    goto End;
+  }
+
  // _Tiffmalloc uses a signed type for size.
  alloc_size = (int64_t)((uint64_t)width * height * sizeof(*raster));
  if (alloc_size < 0 || alloc_size != (tsize_t)alloc_size) goto End;
@@ -169,6 +217,16 @@ int ReadTIFF(const uint8_t* const data, size_t data_size,
 #ifdef WORDS_BIGENDIAN
      TIFFSwabArrayOfLong(raster, width * height);
 #endif
+      // if we have an alpha channel, we must un-multiply from rgbA to RGBA
+      if (extra_samples == 1 && extra_samples_ptr != NULL &&
+          extra_samples_ptr[0] == EXTRASAMPLE_ASSOCALPHA) {
+        uint32_t y;
+        uint8_t* tmp = (uint8_t*)raster;
+        for (y = 0; y < height; ++y) {
+          MultARGBRow(tmp, width);
+          tmp += stride;
+        }
+      }
      ok = keep_alpha
         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
--- a/imageio/webpdec.c
+++ b/imageio/webpdec.c
@@ -138,46 +138,53 @@ int ReadWebP(const uint8_t* const data, size_t data_size,
    PrintWebPError("input data", status);
    return 0;
  }
-  {
+
+  do {
    const int has_alpha = keep_alpha && bitstream->has_alpha;
+    pic->width = bitstream->width;
+    pic->height = bitstream->height;
+    if (!pic->use_argb) pic->colorspace = has_alpha ? WEBP_YUV420A
+                                                    : WEBP_YUV420;
+    ok = WebPPictureAlloc(pic);
+    if (!ok) {
+      status = VP8_STATUS_OUT_OF_MEMORY;
+      break;
+    }
    if (pic->use_argb) {
-      output_buffer->colorspace = has_alpha ? MODE_RGBA : MODE_RGB;
+      output_buffer->colorspace = MODE_BGRA;
+      output_buffer->u.RGBA.rgba = (uint8_t*)pic->argb;
+      output_buffer->u.RGBA.stride = pic->argb_stride * sizeof(uint32_t);
+      output_buffer->u.RGBA.size = output_buffer->u.RGBA.stride * pic->height;
    } else {
      output_buffer->colorspace = has_alpha ? MODE_YUVA : MODE_YUV;
+      output_buffer->u.YUVA.y = pic->y;
+      output_buffer->u.YUVA.u = pic->u;
+      output_buffer->u.YUVA.v = pic->v;
+      output_buffer->u.YUVA.a = has_alpha ? pic->a : NULL;
+      output_buffer->u.YUVA.y_stride = pic->y_stride;
+      output_buffer->u.YUVA.u_stride = pic->uv_stride;
+      output_buffer->u.YUVA.v_stride = pic->uv_stride;
+      output_buffer->u.YUVA.a_stride = has_alpha ? pic->a_stride : 0;
+      output_buffer->u.YUVA.y_size = pic->height * pic->y_stride;
+      output_buffer->u.YUVA.u_size = (pic->height + 1) / 2 * pic->uv_stride;
+      output_buffer->u.YUVA.v_size = (pic->height + 1) / 2 * pic->uv_stride;
+      output_buffer->u.YUVA.a_size = pic->height * pic->a_stride;
    }
+    output_buffer->is_external_memory = 1;

    status = DecodeWebP(data, data_size, &config);
-    if (status == VP8_STATUS_OK) {
-      pic->width = output_buffer->width;
-      pic->height = output_buffer->height;
-      if (pic->use_argb) {
-        const uint8_t* const rgba = output_buffer->u.RGBA.rgba;
-        const int stride = output_buffer->u.RGBA.stride;
-        ok = has_alpha ? WebPPictureImportRGBA(pic, rgba, stride)
-                       : WebPPictureImportRGB(pic, rgba, stride);
-      } else {
-        pic->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
-        ok = WebPPictureAlloc(pic);
-        if (!ok) {
-          status = VP8_STATUS_OUT_OF_MEMORY;
-        } else {
-          const WebPYUVABuffer* const yuva = &output_buffer->u.YUVA;
-          const int uv_width = (pic->width + 1) >> 1;
-          const int uv_height = (pic->height + 1) >> 1;
-          ImgIoUtilCopyPlane(yuva->y, yuva->y_stride,
-                             pic->y, pic->y_stride, pic->width, pic->height);
-          ImgIoUtilCopyPlane(yuva->u, yuva->u_stride,
-                             pic->u, pic->uv_stride, uv_width, uv_height);
-          ImgIoUtilCopyPlane(yuva->v, yuva->v_stride,
-                             pic->v, pic->uv_stride, uv_width, uv_height);
-          if (has_alpha) {
-            ImgIoUtilCopyPlane(yuva->a, yuva->a_stride,
-                               pic->a, pic->a_stride, pic->width, pic->height);
-          }
-        }
+    ok = (status == VP8_STATUS_OK);
+    if (!ok) WebPPictureFree(pic);
+    if (ok && !keep_alpha && pic->use_argb) {
+      // Need to wipe out the alpha value, as requested.
+      int x, y;
+      uint32_t* argb = pic->argb;
+      for (y = 0; y < pic->height; ++y) {
+        for (x = 0; x < pic->width; ++x) argb[x] |= 0xff000000u;
+        argb += pic->argb_stride;
      }
    }
-  }
+  } while (0);   // <- so we can 'break' out of the loop

  if (status != VP8_STATUS_OK) {
    PrintWebPError("input data", status);
--- a/imageio/webpdec.h
+++ b/imageio/webpdec.h
@@ -51,7 +51,7 @@ VP8StatusCode DecodeWebPIncremental(

 //------------------------------------------------------------------------------

-// Reads a WebP from 'in_file', returning the decoded output in 'pic'.
+// Decodes a WebP contained in 'data', returning the decoded output in 'pic'.
 // Output is RGBA or YUVA, depending on pic->use_argb value.
 // If 'keep_alpha' is true and the WebP has an alpha channel, the output is RGBA
 // or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
--- a/makefile.unix
+++ b/makefile.unix
@@ -29,6 +29,8 @@ ifeq ($(strip $(shell uname)), Darwin)
  EXTRA_LIBS  += -L/opt/local/lib
  GL_LIBS = -framework GLUT -framework OpenGL
 else
+  EXTRA_FLAGS += -I/usr/local/include
+  EXTRA_LIBS  += -L/usr/local/lib
  GL_LIBS = -lglut -lGL
 endif

@@ -167,6 +169,7 @@ DSP_DEC_OBJS = \
    src/dsp/yuv.o \
    src/dsp/yuv_mips32.o \
    src/dsp/yuv_mips_dsp_r2.o \
+    src/dsp/yuv_neon.o \
    src/dsp/yuv_sse2.o \

 DSP_ENC_OBJS = \
@@ -192,10 +195,13 @@ DSP_ENC_OBJS = \
    src/dsp/lossless_enc_neon.o \
    src/dsp/lossless_enc_sse2.o \
    src/dsp/lossless_enc_sse41.o \
+    src/dsp/ssim.o \
+    src/dsp/ssim_sse2.o \

 ENC_OBJS = \
    src/enc/alpha_enc.o \
    src/enc/analysis_enc.o \
+    src/enc/backward_references_cost_enc.o \
    src/enc/backward_references_enc.o \
    src/enc/config_enc.o \
    src/enc/cost_enc.o \
@@ -223,6 +229,7 @@ EX_FORMAT_DEC_OBJS = \
    imageio/jpegdec.o \
    imageio/metadata.o \
    imageio/pngdec.o \
+    imageio/pnmdec.o \
    imageio/tiffdec.o \
    imageio/webpdec.o \

@@ -328,8 +335,8 @@ OUT_LIBS += src/libwebp.a
 EXTRA_LIB = extras/libwebpextras.a
 OUT_EXAMPLES = examples/cwebp examples/dwebp
 EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux \
-                 examples/anim_diff examples/img2webp
-OTHER_EXAMPLES = extras/get_disto extras/webp_quality
+                 examples/anim_diff examples/img2webp examples/webpinfo
+OTHER_EXAMPLES = extras/get_disto extras/webp_quality extras/vwebp_sdl

 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 ifeq ($(MAKECMDGOALS),clean)
@@ -380,6 +387,7 @@ examples/gif2webp: examples/gif2webp.o $(GIFDEC_OBJS)
 examples/vwebp: examples/vwebp.o
 examples/webpmux: examples/webpmux.o
 examples/img2webp: examples/img2webp.o
+examples/webpinfo: examples/webpinfo.o

 examples/anim_diff: examples/libanim_util.a examples/libgifdec.a
 examples/anim_diff: src/demux/libwebpdemux.a examples/libexample_util.a
@@ -411,6 +419,8 @@ examples/img2webp: examples/libexample_util.a imageio/libimageio_util.a
 examples/img2webp: imageio/libimagedec.a
 examples/img2webp: src/mux/libwebpmux.a src/libwebp.a
 examples/img2webp: EXTRA_LIBS += $(CWEBP_LIBS)
+examples/webpinfo: examples/libexample_util.a imageio/libimageio_util.a
+examples/webpinfo: src/libwebpdecoder.a

 extras/get_disto: extras/get_disto.o
 extras/get_disto: imageio/libimagedec.a imageio/libimageio_util.a src/libwebp.a
@@ -420,6 +430,13 @@ extras/webp_quality: extras/webp_quality.o
 extras/webp_quality: imageio/libimageio_util.a
 extras/webp_quality: $(EXTRA_LIB) src/libwebp.a

+extras/vwebp_sdl: extras/vwebp_sdl.o
+extras/vwebp_sdl: extras/webp_to_sdl.o
+extras/vwebp_sdl: imageio/libimageio_util.a
+extras/vwebp_sdl: src/libwebp.a
+extras/vwebp_sdl: EXTRA_FLAGS += -DWEBP_HAVE_SDL
+extras/vwebp_sdl: EXTRA_LIBS += -lSDL
+
 $(OUT_EXAMPLES) $(EXTRA_EXAMPLES) $(OTHER_EXAMPLES):
 	$(CC) -o $@ $^ $(LDFLAGS)

@@ -435,7 +452,7 @@ dist: all
 	$(INSTALL) -m644 src/mux/libwebpmux.a $(DESTDIR)/lib
 	umask 022; \
 	for m in man/[cdv]webp.1 man/gif2webp.1 man/webpmux.1 \
-                 man/img2webp.1; do \
+                 man/img2webp.1 man/webpinfo.1; do \
 	  basenam=$$(basename $$m .1); \
 	  $(GROFF) -t -e -man -T utf8 $$m \
 	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.txt; \
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -8,4 +8,7 @@ endif
 if BUILD_VWEBP
  man_MANS += vwebp.1
 endif
+if BUILD_WEBPINFO
+  man_MANS += webpinfo.1
+endif
 EXTRA_DIST = $(man_MANS)
--- a/man/cwebp.1
+++ b/man/cwebp.1
@@ -98,8 +98,7 @@ Crop the source to a rectangle with top\-left corner at coordinates
 This cropping area must be fully contained within the source rectangle.
 .TP
 .B \-mt
-Use multi\-threading for encoding, if possible. This option is only effective
-when using lossy compression on a source with a transparency channel.
+Use multi\-threading for encoding, if possible.
 .TP
 .B \-low_memory
 Reduce memory usage of lossy encoding by saving four times the compressed
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@@ -108,8 +108,7 @@ the value the smoother the picture will appear. Typical values are usually in
 the range of 20 to 50.
 .TP
 .B \-mt
-Use multi-threading for encoding, if possible. This option is only effective
-when using lossy compression.
+Use multi-threading for encoding, if possible.
 .TP
 .B \-v
 Print extra information.
--- a/man/webpinfo.1
+++ b/man/webpinfo.1
@@ -0,0 +1,77 @@
+.\"                                      Hey, EMACS: -*- nroff -*-
+.TH WEBPINFO 1 "May 08, 2017"
+.SH NAME
+webpinfo \- print out the chunk level structure of WebP files
+and perform basic integrity checks.
+.SH SYNOPSIS
+.B webpinfo
+.I OPTIONS
+.I INPUT
+.br
+.B webpinfo [\-h|\-help|\-H|\-longhelp]
+.br
+
+.SH DESCRIPTION
+This manual page documents the
+.B webpinfo
+command.
+.PP
+\fBwebpinfo\fP can be used to print out the chunk level structure and bitstream
+header information of WebP files. It can also check if the files are of valid
+WebP format.
+
+.SH OPTIONS
+.TP
+.B \-quiet
+Do not show chunk parsing information.
+.TP
+.B \-diag
+Show parsing error diagnosis.
+.TP
+.B \-summary
+Show chunk stats summary.
+.TP
+.B \-bitstream_info
+Parse bitstream header.
+.TP
+.B \-h, \-help
+A short usage summary.
+.TP
+.B \-H, \-longhelp
+Detailed usage instructions.
+
+.SH INPUT
+Input files in WebP format. Input files must come last, following
+options (if any). There can be multiple input files.
+
+.SH BUGS
+Please report all bugs to the issue tracker:
+https://bugs.chromium.org/p/webp
+.br
+Patches welcome! See this page to get started:
+http://www.webmproject.org/code/contribute/submitting\-patches/
+
+.SH EXAMPLES
+.br
+webpinfo \-h
+.br
+webpinfo \-diag \-summary input_file.webp
+.br
+webpinfo \-bitstream_info input_file_1.webp input_file_2.webp
+.br
+webpinfo *.webp
+
+.SH AUTHORS
+\fBwebpinfo\fP is a part of libwebp and was written by the WebP team.
+.br
+The latest source tree is available at
+https://chromium.googlesource.com/webm/libwebp
+.PP
+This manual page was written by Hui Su <huisu@google.com>,
+for the Debian project (and may be used by others).
+
+.SH SEE ALSO
+.BR webpmux (1)
+.br
+Please refer to http://developers.google.com/speed/webp/ for additional
+information.
--- a/src/dec/vp8_dec.h
+++ b/src/dec/vp8_dec.h
@@ -33,7 +33,7 @@ extern "C" {
 //   /* customize io's functions (setup()/put()/teardown()) if needed. */
 //
 //   VP8Decoder* dec = VP8New();
-//   bool ok = VP8Decode(dec);
+//   int ok = VP8Decode(dec, &io);
 //   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
 //   VP8Delete(dec);
 //   return ok;
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@@ -1012,12 +1012,13 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
      ok = 0;
      goto End;
    }
-    assert(br->eos_ == VP8LIsEndOfStream(br));
+    br->eos_ = VP8LIsEndOfStream(br);
  }
  // Process the remaining rows corresponding to last row-block.
  ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row);

 End:
+  br->eos_ = VP8LIsEndOfStream(br);
  if (!ok || (br->eos_ && pos < end)) {
    ok = 0;
    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
@@ -1090,11 +1091,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
    VP8LFillBitWindow(br);
    if (htree_group->use_packed_table) {
      code = ReadPackedSymbols(htree_group, br, src);
+      if (VP8LIsEndOfStream(br)) break;
      if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne;
    } else {
      code = ReadSymbol(htree_group->htrees[GREEN], br);
    }
-    if (br->eos_) break;  // early out
+    if (VP8LIsEndOfStream(br)) break;
    if (code < NUM_LITERAL_CODES) {  // Literal
      if (htree_group->is_trivial_literal) {
        *src = htree_group->literal_arb | (code << 8);
@@ -1104,7 +1106,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
        VP8LFillBitWindow(br);
        blue = ReadSymbol(htree_group->htrees[BLUE], br);
        alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
-        if (br->eos_) break;
+        if (VP8LIsEndOfStream(br)) break;
        *src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
      }
    AdvanceByOne:
@@ -1132,7 +1134,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
      VP8LFillBitWindow(br);
      dist_code = GetCopyDistance(dist_symbol, br);
      dist = PlaneCodeToDistance(width, dist_code);
-      if (br->eos_) break;
+      if (VP8LIsEndOfStream(br)) break;
      if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
        goto Error;
      } else {
@@ -1169,9 +1171,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
    } else {  // Not reached
      goto Error;
    }
-    assert(br->eos_ == VP8LIsEndOfStream(br));
  }

+  br->eos_ = VP8LIsEndOfStream(br);
  if (dec->incremental_ && br->eos_ && src < src_end) {
    RestoreState(dec);
  } else if (!br->eos_) {
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -3,6 +3,7 @@ noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
 noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
 noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
 noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la
+noinst_LTLIBRARIES += libwebpdspdecode_wasm.la

 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@@ -50,6 +51,7 @@ ENC_SOURCES += enc_mips_dsp_r2.c
 ENC_SOURCES += lossless_enc.c
 ENC_SOURCES += lossless_enc_mips32.c
 ENC_SOURCES += lossless_enc_mips_dsp_r2.c
+ENC_SOURCES += ssim.c

 libwebpdsp_avx2_la_SOURCES =
 libwebpdsp_avx2_la_SOURCES += enc_avx2.c
@@ -81,6 +83,7 @@ libwebpdspdecode_neon_la_SOURCES += lossless_neon.c
 libwebpdspdecode_neon_la_SOURCES += neon.h
 libwebpdspdecode_neon_la_SOURCES += rescaler_neon.c
 libwebpdspdecode_neon_la_SOURCES += upsampling_neon.c
+libwebpdspdecode_neon_la_SOURCES += yuv_neon.c
 libwebpdspdecode_neon_la_CPPFLAGS = $(libwebpdsp_neon_la_CPPFLAGS)
 libwebpdspdecode_neon_la_CFLAGS = $(libwebpdsp_neon_la_CFLAGS)

@@ -94,11 +97,16 @@ libwebpdspdecode_msa_la_SOURCES += upsampling_msa.c
 libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
 libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)

+# WASM is not fully integrated into configure; the addition here keeps source
+# extraction by cmake simple.
+libwebpdspdecode_wasm_la_SOURCES = dec_wasm.c
+
 libwebpdsp_sse2_la_SOURCES =
 libwebpdsp_sse2_la_SOURCES += argb_sse2.c
 libwebpdsp_sse2_la_SOURCES += cost_sse2.c
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
 libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
+libwebpdsp_sse2_la_SOURCES += ssim_sse2.c
 libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
 libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la
--- a/src/dsp/argb_sse2.c
+++ b/src/dsp/argb_sse2.c
@@ -12,6 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include "./dsp.h"
+#include "./lossless.h"

 #if defined(WEBP_USE_SSE2)

@@ -19,30 +20,13 @@
 #include <emmintrin.h>
 #include <string.h>

-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
 static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                     const uint8_t* b, int len, uint32_t* out) {
+  (void)a;
  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
-    int i = 0;
-    const int len_max = len & ~3;  // max length processed in main loop
-    const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
    assert(b == r + 2);
    assert(a == r + 3);
-    for (; i < len_max; i += 4) {
-      const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
-      const __m128i B = _mm_and_si128(A, red_blue_mask);     // R 0 B 0
-      const __m128i C = _mm_andnot_si128(red_blue_mask, A);  // 0 G 0 A
-      const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
-      const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
-      const __m128i F = _mm_or_si128(E, C);
-      _mm_storeu_si128((__m128i*)(out + i), F);
-    }
-    for (; i < len; ++i) {
-      out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
-    }
+    VP8LConvertBGRAToRGBA((const uint32_t*)r, len, (uint8_t*)out);
  } else {
    assert(g == b + 1);
    assert(r == b + 2);
@@ -55,8 +39,10 @@ static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
 // Entry point

 extern void VP8EncDspARGBInitSSE2(void);
+extern void VP8LDspInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
+  VP8LDspInitSSE2();
  VP8PackARGB = PackARGB;
 }

--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@@ -23,11 +23,13 @@
 #endif

 //------------------------------------------------------------------------------
-// SSE2 detection.
+// x86/x86-64 micro-arch detection.
 //

+// skip x86 specific code for WASM builds
+#if defined(WEBP_USE_WASM)
 // apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
-#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+#elif (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
@@ -63,8 +65,10 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
 #define GetCPUInfo __cpuid
 #endif

+// skip xgetbv definition for WASM builds
+#if defined(WEBP_USE_WASM)
 // NaCl has no support for xgetbv or the raw opcode.
-#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+#elif !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
 static WEBP_INLINE uint64_t xgetbv(void) {
  const uint32_t ecx = 0;
  uint32_t eax, edx;
@@ -94,7 +98,19 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif

-#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+//------------------------------------------------------------------------------
+// Platform specific VP8CPUInfo functions.
+//
+
+// WASM needs to precede platform specific architecture checks as the defines
+// will still be present when building this target.
+#if defined(WEBP_USE_WASM)
+static int wasmCPUInfo(CPUFeature feature) {
+  // kWASM is the only feature this callback reports as available.
+  return (feature == kWASM) ? 1 : 0;
+}
+VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
+#elif defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)

 // helper function for run-time detection of slow SSSE3 platforms
 static int CheckSlowModel(int info) {
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -700,6 +700,7 @@ extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 extern void VP8DspInitMSA(void);
+extern void VP8DspInitWASM(void);

 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
@@ -789,6 +790,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
    if (VP8GetCPUInfo(kMSA)) {
      VP8DspInitMSA();
    }
+#endif
+#if defined(WEBP_USE_WASM)
+    if (VP8GetCPUInfo(kWASM)) {
+      VP8DspInitWASM();
+    }
 #endif
  }
  dec_last_cpuinfo_used = VP8GetCPUInfo;
--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const v16i8 cnst4b = __msa_ldi_b(4);                        \
  const v16i8 cnst3b = __msa_ldi_b(3);                        \
  const v8i16 cnst9h = __msa_ldi_h(9);                        \
+  const v8i16 cnst63h = __msa_ldi_h(63);                      \
                                                              \
  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
  filt = __msa_subs_s_b(p1_m, q1_m);                          \
@@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
  /* update q2/p2 */                                          \
  temp0 = filt_r * cnst9h;                                    \
-  temp1 = ADDVI_H(temp0, 63);                                 \
+  temp1 = temp0 + cnst63h;                                    \
  temp2 = filt_l * cnst9h;                                    \
-  temp3 = ADDVI_H(temp2, 63);                                 \
+  temp3 = temp2 + cnst63h;                                    \
  FILT2(q2_m, p2_m, q2, p2);                                  \
  /* update q1/p1 */                                          \
  temp1 = temp1 + temp0;                                      \
@@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) {    // vertical
  const uint32_t val0 = LW(ptop + 0);
  const uint32_t val1 = LW(ptop + 4);
  uint32_t out;
-  v16u8 A, B, C, AC, B2, R;
+  v16u8 A = { 0 }, B, C, AC, B2, R;

  INSERT_W2_UB(val0, val1, A);
  B = SLDI_UB(A, A, 1);
@@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) {   // Down-right
  uint32_t val0 = LW(ptop + 0);
  uint32_t val1 = LW(ptop + 4);
  uint32_t val2, val3;
-  v16u8 A, B, C, AC, B2, R, A1;
+  v16u8 A, B, C, AC, B2, R, A1 = { 0 };

  INSERT_W2_UB(val0, val1, A1);
  A = SLDI_UB(A1, A1, 12);
@@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
  uint32_t val0 = LW(ptop + 0);
  uint32_t val1 = LW(ptop + 4);
  uint32_t val2, val3;
-  v16u8 A, B, C, AC, B2, R;
+  v16u8 A = { 0 }, B, C, AC, B2, R;

  INSERT_W2_UB(val0, val1, A);
  B = SLDI_UB(A, A, 1);
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -38,10 +38,23 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif

+#if defined(__clang__)
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
 #ifndef __has_builtin
 # define __has_builtin(x) 0
 #endif

+// For now, none of the optimizations below are available in emscripten.
+// WebAssembly overrides native optimizations.
+#if !(defined(EMSCRIPTEN) || defined(WEBP_USE_WASM))
+
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@@ -68,18 +81,20 @@ extern "C" {
 #define WEBP_USE_AVX2
 #endif

-#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
-#define WEBP_ANDROID_NEON  // Android targets that might support NEON
-#endif
-
 // The intrinsics currently cause compiler errors with arm-nacl-gcc and the
 // inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+#if (defined(__ARM_NEON__) || \
     defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
    !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif

+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
 #define WEBP_USE_NEON
 #define WEBP_USE_INTRINSICS
@@ -100,6 +115,8 @@ extern "C" {
 #define WEBP_USE_MSA
 #endif

+#endif  /* !(EMSCRIPTEN || WEBP_USE_WASM) */
+
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 #define WEBP_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
@@ -139,7 +156,8 @@ typedef enum {
  kNEON,
  kMIPS32,
  kMIPSdspR2,
-  kMSA
+  kMSA,
+  kWASM
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -690,140 +690,6 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 16, 8);
 }

-//------------------------------------------------------------------------------
-// SSIM / PSNR
-
-// hat-shaped filter. Sum of coefficients is equal to 16.
-static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
-  1, 2, 3, 4, 3, 2, 1
-};
-static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
-
-static WEBP_INLINE double SSIMCalculation(
-    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
-  const uint32_t w2 =  N * N;
-  const uint32_t C1 = 20 * w2;
-  const uint32_t C2 = 60 * w2;
-  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
-  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
-  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
-  if (xmxm + ymym >= C3) {
-    const int64_t xmym = (int64_t)stats->xm * stats->ym;
-    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
-    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
-    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
-    // we descale by 8 to prevent overflow during the fnum/fden multiply.
-    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
-    const uint64_t den_S = (sxx + syy + C2) >> 8;
-    const uint64_t fnum = (2 * xmym + C1) * num_S;
-    const uint64_t fden = (xmxm + ymym + C1) * den_S;
-    const double r = (double)fnum / fden;
-    assert(r >= 0. && r <= 1.0);
-    return r;
-  }
-  return 1.;   // area is too dark to contribute meaningfully
-}
-
-double VP8SSIMFromStats(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, kWeightSum);
-}
-
-double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, stats->w);
-}
-
-static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
-                               const uint8_t* src2, int stride2,
-                               int xo, int yo, int W, int H) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
-  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
-                                                  : yo + VP8_SSIM_KERNEL;
-  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
-  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
-                                                  : xo + VP8_SSIM_KERNEL;
-  int x, y;
-  src1 += ymin * stride1;
-  src2 += ymin * stride2;
-  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
-    for (x = xmin; x <= xmax; ++x) {
-      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
-                       * kWeight[VP8_SSIM_KERNEL + y - yo];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.w   += w;
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStatsClipped(&stats);
-}
-
-static double SSIMGet_C(const uint8_t* src1, int stride1,
-                        const uint8_t* src2, int stride2) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  int x, y;
-  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
-    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const uint32_t w = kWeight[x] * kWeight[y];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStats(&stats);
-}
-
-//------------------------------------------------------------------------------
-
-static uint32_t AccumulateSSE(const uint8_t* src1,
-                              const uint8_t* src2, int len) {
-  int i;
-  uint32_t sse2 = 0;
-  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
-  for (i = 0; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-//------------------------------------------------------------------------------
-
-VP8SSIMGetFunc VP8SSIMGet;
-VP8SSIMGetClippedFunc VP8SSIMGetClipped;
-VP8AccumulateSSEFunc VP8AccumulateSSE;
-
-extern void VP8SSIMDspInitSSE2(void);
-
-static volatile VP8CPUInfo ssim_last_cpuinfo_used =
-    (VP8CPUInfo)&ssim_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
-  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8SSIMGetClipped = SSIMGetClipped_C;
-  VP8SSIMGet = SSIMGet_C;
-
-  VP8AccumulateSSE = AccumulateSSE;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8SSIMDspInitSSE2();
-    }
-#endif
-  }
-
-  ssim_last_cpuinfo_used = VP8GetCPUInfo;
-}
-
 //------------------------------------------------------------------------------
 // Initialization

--- a/src/dsp/enc_msa.c
+++ b/src/dsp/enc_msa.c
@@ -82,7 +82,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
-  v16u8 srcl0, srcl1, src0, src1;
+  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
@@ -170,7 +170,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
-  v16i8 src0;
+  v16i8 src0 = { 0 };
  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
  v4i32 dst0, dst1;
  const v16i8 zero = { 0 };
@@ -259,8 +259,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

 static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top - 1);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C = SLDI_UB(A, A, 2);
  const v16u8 AC = __msa_ave_u_b(A, C);
@@ -292,8 +293,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
 }

 static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A2 = { 0 };
  const uint64_t val_m = LD(top - 5);
-  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
  const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C = SLDI_UB(A, A, 2);
@@ -311,8 +313,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
 }

 static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C1 = SLDI_UB(A, A, 2);
  const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
@@ -645,7 +648,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
 static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
                                  const uint8_t* top) {
  uint64_t out;
-  v16u8 src;
+  v16u8 src = { 0 };
  if (top != NULL && left != NULL) {
    const uint64_t left_m = LD(left);
    const uint64_t top_m = LD(top);
@@ -777,7 +780,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  uint32_t sum = 0;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 src, ref, tmp0, tmp1;
+  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
  v8i16 diff0, diff1;
  v4i32 out0, out1;

@@ -828,7 +831,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  tmp1 = (tmp3 > maxlevel);
  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
-  SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
+  SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
  LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3);   // zthresh
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -1366,119 +1366,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
  VP8Mean16x4 = Mean16x4;
 }

-//------------------------------------------------------------------------------
-// SSIM / PSNR entry point (TODO(skal): move to its own file later)
-
-static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
-                                   const uint8_t* src2, int len) {
-  int i = 0;
-  uint32_t sse2 = 0;
-  if (len >= 16) {
-    const int limit = len - 32;
-    int32_t tmp[4];
-    __m128i sum1;
-    __m128i sum = _mm_setzero_si128();
-    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-    i += 16;
-    while (i <= limit) {
-      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      __m128i sum2;
-      i += 16;
-      SubtractAndAccumulate(a0, b0, &sum1);
-      sum = _mm_add_epi32(sum, sum1);
-      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      i += 16;
-      SubtractAndAccumulate(a1, b1, &sum2);
-      sum = _mm_add_epi32(sum, sum2);
-    }
-    SubtractAndAccumulate(a0, b0, &sum1);
-    sum = _mm_add_epi32(sum, sum1);
-    _mm_storeu_si128((__m128i*)tmp, sum);
-    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
-  }
-
-  for (; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-static uint32_t HorizontalAdd16b(const __m128i* const m) {
-  uint16_t tmp[8];
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi16(*m, a);
-  _mm_storeu_si128((__m128i*)tmp, b);
-  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
-}
-
-static uint32_t HorizontalAdd32b(const __m128i* const m) {
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi32(*m, a);
-  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
-  return (uint32_t)_mm_cvtsi128_si32(c);
-}
-
-static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
-
-#define ACCUMULATE_ROW(WEIGHT) do {                         \
-  /* compute row weight (Wx * Wy) */                        \
-  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
-  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
-  /* process 8 bytes at a time (7 bytes, actually) */       \
-  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
-  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
-  /* convert to 16b and multiply by weight */               \
-  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
-  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
-  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
-  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
-  /* accumulate */                                          \
-  xm  = _mm_add_epi16(xm, wa1);                             \
-  ym  = _mm_add_epi16(ym, wb1);                             \
-  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
-  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
-  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
-  src1 += stride1;                                          \
-  src2 += stride2;                                          \
-} while (0)
-
-static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
-                           const uint8_t* src2, int stride2) {
-  VP8DistoStats stats;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i xm = zero, ym = zero;                // 16b accums
-  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
-  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
-  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
-  ACCUMULATE_ROW(1);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(4);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(1);
-  stats.xm  = HorizontalAdd16b(&xm);
-  stats.ym  = HorizontalAdd16b(&ym);
-  stats.xxm = HorizontalAdd32b(&xxm);
-  stats.xym = HorizontalAdd32b(&xym);
-  stats.yym = HorizontalAdd32b(&yym);
-  return VP8SSIMFromStats(&stats);
-}
-
-extern void VP8SSIMDspInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
-  VP8AccumulateSSE = AccumulateSSE_SSE2;
-  VP8SSIMGet = SSIMGet_SSE2;
-}
-
 #else  // !WEBP_USE_SSE2

 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
-WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)

 #endif  // WEBP_USE_SSE2
--- a/src/dsp/lossless_common.h
+++ b/src/dsp/lossless_common.h
@@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
 // -----------------------------------------------------------------------------
 // PrefixEncode()

-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int log_floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1))) {  // zero or a power of two.
-    return log_floor;
-  }
-  return log_floor + 1;
-}
-
 // Splitting of distance and length codes into prefixes and
 // extra bits. The prefixes are encoded with an entropy code
 // while the extra bits are stored just as normal bits.
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@@ -520,8 +520,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
    const uint32_t argb = data[i];
    const uint32_t green = argb >> 8;
    const uint32_t red = argb >> 16;
-    int new_red = red;
-    int new_blue = argb;
+    int new_red = red & 0xff;
+    int new_blue = argb & 0xff;
    new_red -= ColorTransformDelta(m->green_to_red_, green);
    new_red &= 0xff;
    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
--- a/src/dsp/lossless_msa.c
+++ b/src/dsp/lossless_msa.c
@@ -43,7 +43,7 @@

 #define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do {         \
  uint64_t pix_d;                                          \
-  v16u8 src0, src1, src2, dst0, dst1;                      \
+  v16u8 src0, src1, src2 = { 0 }, dst0, dst1;              \
  LD_UB2(psrc, 16, src0, src1);                            \
  VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
  ST_UB(dst0, pdst);                                       \
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@@ -272,9 +272,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 #undef GENERATE_PREDICTOR_2

 // Predictor10: average of (average of (L,TL), average of (T, TR)).
+#define DO_PRED10(OUT) do {               \
+  __m128i avgLTL, avg;                    \
+  Average2_m128i(&L, &TL, &avgLTL);       \
+  Average2_m128i(&avgTTR, &avgLTL, &avg); \
+  L = _mm_add_epi8(avg, src);             \
+  out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
+} while (0)
+
+#define DO_PRED10_SHIFT do {                                  \
+  /* Rotate the pre-computed values for the next iteration.*/ \
+  avgTTR = _mm_srli_si128(avgTTR, 4);                         \
+  TL = _mm_srli_si128(TL, 4);                                 \
+  src = _mm_srli_si128(src, 4);                               \
+} while (0)
+
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
-  int i, j;
+  int i;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@@ -283,77 +298,88 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
-    for (j = 0; j < 4; ++j) {
-      __m128i avgLTL, avg;
-      Average2_m128i(&L, &TL, &avgLTL);
-      Average2_m128i(&avgTTR, &avgLTL, &avg);
-      L = _mm_add_epi8(avg, src);
-      out[i + j] = _mm_cvtsi128_si32(L);
-      // Rotate the pre-computed values for the next iteration.
-      avgTTR = _mm_srli_si128(avgTTR, 4);
-      TL = _mm_srli_si128(TL, 4);
-      src = _mm_srli_si128(src, 4);
-    }
+    DO_PRED10(0);
+    DO_PRED10_SHIFT;
+    DO_PRED10(1);
+    DO_PRED10_SHIFT;
+    DO_PRED10(2);
+    DO_PRED10_SHIFT;
+    DO_PRED10(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
 }
+#undef DO_PRED10
+#undef DO_PRED10_SHIFT

 // Predictor11: select.
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
-                            __m128i* const out) {
-  // We can unpack with any value on the upper 32 bits, provided it's the same
-  // on both operands (to that their sum of abs diff is zero). Here we use *A.
-  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
-  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
-  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
-  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
-  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
-  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
-  *out = _mm_packs_epi32(s_lo, s_hi);
-}
+#define DO_PRED11(OUT) do {                                            \
+  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                       \
+  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                     \
+  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
+  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                        \
+  const __m128i A = _mm_and_si128(mask, L);                            \
+  const __m128i B = _mm_andnot_si128(mask, T);                         \
+  const __m128i pred = _mm_or_si128(A, B); /* pred = pb > pa ? L : T*/ \
+  L = _mm_add_epi8(src, pred);                                         \
+  out[i + (OUT)] = _mm_cvtsi128_si32(L);                               \
+} while (0)
+
+#define DO_PRED11_SHIFT do {                                \
+  /* Shift the pre-computed value for the next iteration.*/ \
+  T = _mm_srli_si128(T, 4);                                 \
+  TL = _mm_srli_si128(TL, 4);                               \
+  src = _mm_srli_si128(src, 4);                             \
+  pa = _mm_srli_si128(pa, 4);                               \
+} while (0)

 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
-  int i, j;
+  int i;
+  __m128i pa;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
-    __m128i pa;
-    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
-    for (j = 0; j < 4; ++j) {
-      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
-      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
-      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
-      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
-      const __m128i A = _mm_and_si128(mask, L);
-      const __m128i B = _mm_andnot_si128(mask, T);
-      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
-      L = _mm_add_epi8(src, pred);
-      out[i + j] = _mm_cvtsi128_si32(L);
-      // Shift the pre-computed value for the next iteration.
-      T = _mm_srli_si128(T, 4);
-      TL = _mm_srli_si128(TL, 4);
-      src = _mm_srli_si128(src, 4);
-      pa = _mm_srli_si128(pa, 4);
+    {
+      // We can unpack with any value on the upper 32 bits, provided it's the
+      // same on both operands (so that their sum of abs diff is zero). Here we
+      // use T.
+      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
+      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
+      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
+      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
+      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
+      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
+      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
+    DO_PRED11(0);
+    DO_PRED11_SHIFT;
+    DO_PRED11(1);
+    DO_PRED11_SHIFT;
+    DO_PRED11(2);
+    DO_PRED11_SHIFT;
+    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
 }
+#undef DO_PRED11
+#undef DO_PRED11_SHIFT

 // Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT)                          \
-do {                                                        \
-  const __m128i all = _mm_add_epi16(L, (DIFF));             \
-  const __m128i alls = _mm_packus_epi16(all, all);          \
-  const __m128i res = _mm_add_epi8(src, alls);              \
-  out[i + (OUT)] = _mm_cvtsi128_si32(res);                  \
-  L = _mm_unpacklo_epi8(res, zero);                         \
+#define DO_PRED12(DIFF, LANE, OUT) do {            \
+  const __m128i all = _mm_add_epi16(L, (DIFF));    \
+  const __m128i alls = _mm_packus_epi16(all, all); \
+  const __m128i res = _mm_add_epi8(src, alls);     \
+  out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
+  L = _mm_unpacklo_epi8(res, zero);                \
+} while (0)
+
+#define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
  /* Shift the pre-computed value for the next iteration.*/ \
  if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8);        \
  src = _mm_srli_si128(src, 4);                             \
@@ -377,8 +403,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
+    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
+    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
+    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
@@ -386,6 +415,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
  }
 }
 #undef DO_PRED12
+#undef DO_PRED12_SHIFT

 // Due to averages with integers, values cannot be accumulated in parallel for
 // predictors 13.
@@ -492,25 +522,24 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,

 static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
+  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
-    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
-    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
-    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
-    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
-    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
-    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
-    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
-    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
-    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
-    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
-    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
-    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
-    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
-    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
-    _mm_storeu_si128(out++, rgba0);
-    _mm_storeu_si128(out++, rgba4);
+    const __m128i A1 = _mm_loadu_si128(in++);
+    const __m128i A2 = _mm_loadu_si128(in++);
+    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
+    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
+    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
+    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
+    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i F1 = _mm_or_si128(E1, C1);
+    const __m128i F2 = _mm_or_si128(E2, C2);
+    _mm_storeu_si128(out++, F1);
+    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
--- a/src/dsp/msa_macro.h
+++ b/src/dsp/msa_macro.h
@@ -22,6 +22,7 @@
 #endif

 #ifdef CLANG_BUILD
+  #define ALPHAVAL  (-1)
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)a, b)
  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)a, b)
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)a, b)
@@ -32,6 +33,7 @@
  #define ANDI_B(a, b)  __msa_andi_b((v16u8)a, b)
  #define ORI_B(a, b)   __msa_ori_b((v16u8)a, b)
 #else
+  #define ALPHAVAL  (0xff)
  #define ADDVI_H(a, b)  (a + b)
  #define ADDVI_W(a, b)  (a + b)
  #define SRAI_B(a, b)  (a >> b)
--- a/src/dsp/neon.h
+++ b/src/dsp/neon.h
@@ -17,8 +17,9 @@
 #include "./dsp.h"

 // Right now, some intrinsics functions seem slower, so we disable them
-// everywhere except aarch64 where the inline assembly is incompatible.
-#if defined(__aarch64__)
+// everywhere except newer clang/gcc or aarch64 where the inline assembly is
+// incompatible.
+#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
 #define WEBP_USE_INTRINSICS   // use intrinsics when possible
 #endif

@@ -43,7 +44,7 @@
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
-#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
 #define WORK_AROUND_GCC
 #endif

--- a/src/dsp/ssim.c
+++ b/src/dsp/ssim.c
@@ -0,0 +1,151 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>  // for abs()
+
+#include "./dsp.h"
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
+
+static WEBP_INLINE double SSIMCalculation(
+    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
+  const uint32_t w2 =  N * N;
+  const uint32_t C1 = 20 * w2;
+  const uint32_t C2 = 60 * w2;
+  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  if (xmxm + ymym >= C3) {
+    const int64_t xmym = (int64_t)stats->xm * stats->ym;
+    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+    // we shift right by 8 bits to prevent overflow in the fnum/fden multiply.
+    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+    const uint64_t den_S = (sxx + syy + C2) >> 8;
+    const uint64_t fnum = (2 * xmym + C1) * num_S;
+    const uint64_t fden = (xmxm + ymym + C1) * den_S;
+    const double r = (double)fnum / fden;
+    assert(r >= 0. && r <= 1.0);
+    return r;
+  }
+  return 1.;   // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, kWeightSum);  // N = full-window weight
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, stats->w);  // N = weight accumulated in 'w'
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+                               const uint8_t* src2, int stride2,
+                               int xo, int yo, int W, int H) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+                                                  : yo + VP8_SSIM_KERNEL;
+  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+                                                  : xo + VP8_SSIM_KERNEL;
+  int x, y;
+  src1 += ymin * stride1;   // start at the first row inside the window
+  src2 += ymin * stride2;
+  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+    for (x = xmin; x <= xmax; ++x) {
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+                       * kWeight[VP8_SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;   // total weight of the pixels actually visited
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStatsClipped(&stats);  // normalizes by stats.w
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+                        const uint8_t* src2, int stride2) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };  // .w is not accumulated here
+  int x, y;
+  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
+      const uint32_t w = kWeight[x] * kWeight[y];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStats(&stats);  // normalizes by the constant kWeightSum
+}
+
+//------------------------------------------------------------------------------
+
+static uint32_t AccumulateSSE(const uint8_t* src1,
+                              const uint8_t* src2, int len) {
+  int i;
+  uint32_t sse2 = 0;
+  assert(len <= 65535);  // 65535 * 255^2 < 2^32: the sum fits in uint32_t
+  for (i = 0; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];  // in [-255, 255]
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+
+//------------------------------------------------------------------------------
+
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+
+extern void VP8SSIMDspInitSSE2(void);  // defined in ssim_sse2.c
+
+static volatile VP8CPUInfo ssim_last_cpuinfo_used =
+    (VP8CPUInfo)&ssim_last_cpuinfo_used;  // starts as its own address
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
+  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;  // already set up
+
+  VP8SSIMGetClipped = SSIMGetClipped_C;  // install the C versions first
+  VP8SSIMGet = SSIMGet_C;
+
+  VP8AccumulateSSE = AccumulateSSE;
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SSIMDspInitSSE2();  // may replace some of the pointers set above
+    }
+#endif
+  }
+
+  ssim_last_cpuinfo_used = VP8GetCPUInfo;  // remember the hook that was used
+}
--- a/src/dsp/ssim_sse2.c
+++ b/src/dsp/ssim_sse2.c
@@ -0,0 +1,154 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./common_sse2.h"
+
+// Helper: *sum receives four 32b partial sums of the squared byte differences.
+static WEBP_INLINE void SubtractAndSquare(const __m128i a, const __m128i b,
+                                          __m128i* const sum) {
+  // take abs(a-b) in 8b (one of the two saturating differences is zero)
+  const __m128i a_b = _mm_subs_epu8(a, b);
+  const __m128i b_a = _mm_subs_epu8(b, a);
+  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+  // zero-extend to 16b
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+  // square each value and add adjacent pairs into 32b lanes
+  const __m128i sum1 = _mm_madd_epi16(C0, C0);
+  const __m128i sum2 = _mm_madd_epi16(C1, C1);
+  *sum = _mm_add_epi32(sum1, sum2);
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+                                   const uint8_t* src2, int len) {
+  int i = 0;
+  uint32_t sse2 = 0;  // sum of squared differences over src1[0..len-1]
+  if (len >= 16) {
+    const int limit = len - 32;  // two more 16-byte loads fit when i <= limit
+    uint32_t tmp[4];  // unsigned: the four-lane total may exceed INT32_MAX
+    __m128i sum1;
+    __m128i sum = _mm_setzero_si128();
+    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+    i += 16;
+    while (i <= limit) {
+      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      __m128i sum2;
+      i += 16;
+      SubtractAndSquare(a0, b0, &sum1);
+      sum = _mm_add_epi32(sum, sum1);
+      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      i += 16;
+      SubtractAndSquare(a1, b1, &sum2);
+      sum = _mm_add_epi32(sum, sum2);
+    }
+    SubtractAndSquare(a0, b0, &sum1);  // the pair loaded last is still pending
+    sum = _mm_add_epi32(sum, sum1);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+
+  for (; i < len; ++i) {  // scalar tail: fewer than 16 bytes remain
+    const int32_t diff = src1[i] - src2[i];
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+
+static uint32_t HorizontalAdd16b(const __m128i* const m) {
+  uint16_t tmp[8];
+  const __m128i a = _mm_srli_si128(*m, 8);   // upper four 16b lanes moved down
+  const __m128i b = _mm_add_epi16(*m, a);    // lanes 0..3 = lo + hi (16b wrap)
+  _mm_storeu_si128((__m128i*)tmp, b);
+  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];  // final sum done in 32b
+}
+
+static uint32_t HorizontalAdd32b(const __m128i* const m) {
+  const __m128i a = _mm_srli_si128(*m, 8);
+  const __m128i b = _mm_add_epi32(*m, a);   // lanes 0,1 = lo half + hi half
+  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));  // lane 0 = total
+  return (uint32_t)_mm_cvtsi128_si32(c);
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };  // last 0: pad
+
+#define ACCUMULATE_ROW(WEIGHT) do {                         \
+  /* compute row weight (Wx * Wy) */                        \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
+  /* load 8 bytes per row; the 8th one has weight 0 */      \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+  /* convert to 16b and multiply by weight */               \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
+  /* accumulate */                                          \
+  xm  = _mm_add_epi16(xm, wa1);                             \
+  ym  = _mm_add_epi16(ym, wb1);                             \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
+  src1 += stride1;                                          \
+  src2 += stride2;                                          \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+                           const uint8_t* src2, int stride2) {
+  VP8DistoStats stats;   // .w stays unset: VP8SSIMFromStats() doesn't read it
+  const __m128i zero = _mm_setzero_si128();
+  __m128i xm = zero, ym = zero;                // 16b accums
+  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accums
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+  assert(2 * VP8_SSIM_KERNEL + 1 == 7);  // 7 rows are unrolled below
+  ACCUMULATE_ROW(1);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
+  stats.xm  = HorizontalAdd16b(&xm);
+  stats.ym  = HorizontalAdd16b(&ym);
+  stats.xxm = HorizontalAdd32b(&xxm);
+  stats.xym = HorizontalAdd32b(&xym);
+  stats.yym = HorizontalAdd32b(&yym);
+  return VP8SSIMFromStats(&stats);
+}
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+  VP8AccumulateSSE = AccumulateSSE_SSE2;
+  VP8SSIMGet = SSIMGet_SSE2;  // VP8SSIMGetClipped is not overridden here
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@@ -93,13 +93,13 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
 }

 // All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
-UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
+UPSAMPLE_FUNC(UpsampleRgbLinePair_C,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair_C,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair_C,  VP8YuvToRgb565,  2)

 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@@ -161,13 +161,13 @@ void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,           \
  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
 }

-YUV444_FUNC(WebPYuv444ToRgbC,      VP8YuvToRgb,  3)
-YUV444_FUNC(WebPYuv444ToBgrC,      VP8YuvToBgr,  3)
-YUV444_FUNC(WebPYuv444ToRgbaC,     VP8YuvToRgba, 4)
-YUV444_FUNC(WebPYuv444ToBgraC,     VP8YuvToBgra, 4)
-YUV444_FUNC(WebPYuv444ToArgbC,     VP8YuvToArgb, 4)
-YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
-YUV444_FUNC(WebPYuv444ToRgb565C,   VP8YuvToRgb565, 2)
+YUV444_FUNC(WebPYuv444ToRgb_C,      VP8YuvToRgb,  3)
+YUV444_FUNC(WebPYuv444ToBgr_C,      VP8YuvToBgr,  3)
+YUV444_FUNC(WebPYuv444ToRgba_C,     VP8YuvToRgba, 4)
+YUV444_FUNC(WebPYuv444ToBgra_C,     VP8YuvToBgra, 4)
+YUV444_FUNC(WebPYuv444ToArgb_C,     VP8YuvToArgb, 4)
+YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
+YUV444_FUNC(WebPYuv444ToRgb565_C,   VP8YuvToRgb565, 2)

 #undef YUV444_FUNC

@@ -182,17 +182,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
  if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;

-  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgbC;
-  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgbaC;
-  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgrC;
-  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgraC;
-  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgbC;
-  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
-  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565C;
-  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgbaC;
-  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgraC;
-  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgbC;
-  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
+  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgb_C;
+  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgba_C;
+  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgr_C;
+  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgra_C;
+  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgb_C;
+  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
+  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565_C;
+  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgba_C;
+  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgra_C;
+  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgb_C;
+  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;

  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -224,17 +224,17 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;

 #ifdef FANCY_UPSAMPLING
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair_C;
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair_C;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair_C;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair_C;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair_C;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair_C;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair_C;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair_C;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair_C;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
--- a/src/dsp/upsampling_msa.c
+++ b/src/dsp/upsampling_msa.c
@@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
 static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
    STORE16_4(R, G, B, A, dst);
@@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
 static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
    STORE16_4(B, G, R, A, dst);
@@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
 static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
    STORE16_4(A, R, G, B, dst);
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@@ -121,10 +121,10 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],

 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
                       top_dst, bottom_dst, cur_x) do {                        \
-  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
+  FUNC##32_SSE2(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);         \
  if (bottom_y != NULL) {                                                      \
-    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
-             bottom_dst + (cur_x) * XSTEP);                                    \
+    FUNC##32_SSE2(bottom_y + (cur_x), r_u + 64, r_v + 64,                      \
+                  bottom_dst + (cur_x) * XSTEP);                               \
  }                                                                            \
 } while (0)

@@ -213,29 +213,40 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE2(void);

-#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
-extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u,             \
-                               const uint8_t* v, uint8_t* dst, int len);       \
+#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
+extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
+                   uint8_t* dst, int len);                                     \
 static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
                      uint8_t* dst, int len) {                                 \
  int i;                                                                       \
  const int max_len = len & ~31;                                               \
  for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
  if (i < len) {  /* C-fallback */                                             \
-    WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i);         \
+    CALL_C(y + i, u + i, v + i, dst + i * XSTEP, len - i);                     \
  }                                                                            \
 }

-YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
-YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
-YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
-YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
+YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4)
+YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4)
+YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3)
+YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3)
+YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
+YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
+            WebPYuv444ToRgba4444_C, 2)
+YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
-  WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
-  WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
-  WebPYUV444Converters[MODE_RGB]  = Yuv444ToRgb;
-  WebPYUV444Converters[MODE_BGR]  = Yuv444ToBgr;
+  WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba_SSE2;
+  WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra_SSE2;
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb_SSE2;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr_SSE2;
+  WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb_SSE2;
+  WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
+  WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565_SSE2;
+  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba_SSE2;
+  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra_SSE2;
+  WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb_SSE2;
+  WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
 }

 #else
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@@ -308,7 +308,9 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
    (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;

 extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitConvertARGBToYUVNEON(void);
 extern void WebPInitSharpYUVSSE2(void);
+extern void WebPInitSharpYUVNEON(void);

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
  if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
@@ -332,6 +334,13 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
      WebPInitSharpYUVSSE2();
    }
 #endif  // WEBP_USE_SSE2
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitConvertARGBToYUVNEON();
+      WebPInitSharpYUVNEON();
+    }
+#endif  // WEBP_USE_NEON
+
  }
  rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@@ -166,20 +166,20 @@ void VP8YUVInit(void);
 #if defined(WEBP_USE_SSE2)

 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst);
-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                      uint8_t* dst);
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst);
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+                             const uint8_t* v, uint8_t* dst);
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                           uint8_t* dst);

 #endif    // WEBP_USE_SSE2

--- a/src/dsp/yuv_neon.c
+++ b/src/dsp/yuv_neon.c
@@ -0,0 +1,289 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./yuv.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./neon.h"
+
+//-----------------------------------------------------------------------------
+
+static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
+                                    const uint8x8_t G,
+                                    const uint8x8_t B) {  // 8 pixels -> 8 Y
+  const uint16x8_t r = vmovl_u8(R);   // widen 8b -> 16b
+  const uint16x8_t g = vmovl_u8(G);
+  const uint16x8_t b = vmovl_u8(B);
+  const uint16x4_t r_lo = vget_low_u16(r);    // split in 4-lane halves for
+  const uint16x4_t r_hi = vget_high_u16(r);   // the 16b x 16b -> 32b multiplies
+  const uint16x4_t g_lo = vget_low_u16(g);
+  const uint16x4_t g_hi = vget_high_u16(g);
+  const uint16x4_t b_lo = vget_low_u16(b);
+  const uint16x4_t b_hi = vget_high_u16(b);
+  const uint32x4_t tmp0_lo = vmull_n_u16(         r_lo, 16839u);  // R weight
+  const uint32x4_t tmp0_hi = vmull_n_u16(         r_hi, 16839u);
+  const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);  // + G weight
+  const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
+  const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);   // + B weight
+  const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
+  const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),   // rounded
+                                     vrshrn_n_u32(tmp2_hi, 16));  // >> 16
+  const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));   // + 16 offset
+  return vqmovn_u16(Y2);   // saturating narrow back to 8b
+}
+
+static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
+  int i;
+  for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {   // 8 pixels per pass
+    const uint8x8x3_t RGB = vld3_u8(rgb);   // de-interleaved: val[0..2]
+    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i, rgb += 3) {   // left-over (< 8 pixels), scalar path
+    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+  }
+}
+
+static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
+  int i;
+  for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {   // 8 pixels per pass
+    const uint8x8x3_t BGR = vld3_u8(bgr);   // val[2] is passed as R below
+    const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i, bgr += 3) {  // left-over (< 8 pixels), scalar path
+    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+  }
+}
+
+static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
+  int i;
+  for (i = 0; i + 8 <= width; i += 8) {   // 8 pixels per pass
+    const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);  // byte-wise
+    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i) {   // left-over (< 8 pixels), scalar path
+    const uint32_t p = argb[i];   // R = bits 16..23, G = 8..15, B = 0..7
+    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
+                     YUV_HALF);
+  }
+}
+
+//-----------------------------------------------------------------------------
+
+// MULTIPLY_16b computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
+#define MULTIPLY_16b_PREAMBLE(r, g, b)                           \
+  const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r));  \
+  const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
+  const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g));  \
+  const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
+  const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b));  \
+  const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
+
+#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do {              \
+  const int32x4_t tmp0_lo = vmull_n_s16(         r_lo, C0);      \
+  const int32x4_t tmp0_hi = vmull_n_s16(         r_hi, C0);      \
+  const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1);      \
+  const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1);      \
+  const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2);      \
+  const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2);      \
+  const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16),  \
+                                      vshrn_n_s32(tmp2_hi, 16)); \
+  DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST));                   \
+} while (0)
+
+// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
+#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do {     \
+  MULTIPLY_16b_PREAMBLE(r, g, b);                                \
+  MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST);       \
+  MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
+} while (0)
+
+static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
+                                   uint8_t* u, uint8_t* v, int width) {
+  int i;
+  for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {   // 8 samples per pass
+    const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
+    int16x8_t U, V;   // val[3] (the 4th channel) is not used
+    CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
+    vst1_u8(u + i, vqrshrun_n_s16(U, 2));   // rounded >> 2, saturated to 8b
+    vst1_u8(v + i, vqrshrun_n_s16(V, 2));
+  }
+  for (; i < width; i += 1, rgb += 4) {   // left-over, scalar path
+    const int r = rgb[0], g = rgb[1], b = rgb[2];
+    u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+    v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+  }
+}
+
+static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                                 int src_width, int do_store) {
+  int i;
+  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {  // 16 in, 8 out
+    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
+    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
+    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
+    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
+    int16x8_t U_tmp, V_tmp;
+    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
+    {
+      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);   // rounded >> 1, to 8b
+      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
+      if (do_store) {   // overwrite u[] / v[]
+        vst1_u8(u, U);
+        vst1_u8(v, V);
+      } else {          // rounded average with what u[] / v[] already hold
+        const uint8x8_t prev_u = vld1_u8(u);
+        const uint8x8_t prev_v = vld1_u8(v);
+        vst1_u8(u, vrhadd_u8(U, prev_u));
+        vst1_u8(v, vrhadd_u8(V, prev_v));
+      }
+    }
+  }
+  if (i < src_width) {  // left-over: delegated to the C implementation
+    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+  }
+}
+
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitConvertARGBToYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
+  WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;   // luma converters
+  WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
+  WebPConvertARGBToY = ConvertARGBToY_NEON;
+  WebPConvertARGBToUV = ConvertARGBToUV_NEON;   // chroma converters
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
+}
+
+//------------------------------------------------------------------------------
+
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y(int v) {   // clamps v to [0, MAX_Y]
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len) {
+  int i;
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t max = vdupq_n_s16(MAX_Y);
+  uint64x2_t sum = vdupq_n_u64(0);   // accumulates abs(diff_y) in two lanes
+  uint64_t diff;                     // returned: total of abs(ref[] - src[])
+
+  for (i = 0; i + 8 <= len; i += 8) {   // 8 values per pass
+    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+    const int16x8_t D = vsubq_s16(A, B);       // diff_y
+    const int16x8_t F = vaddq_s16(C, D);       // new_y
+    const uint16x8_t H =
+        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));  // clipped
+    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
+    vst1q_u16(dst + i, H);
+    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
+  }
+  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);   // fold both lanes
+  for (; i < len; ++i) {   // left-over (< 8 values), scalar path
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)(dst[i]) + diff_y;
+    dst[i] = clip_y(new_y);
+    diff += (uint64_t)(abs(diff_y));
+  }
+  return diff;
+}
+
+static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i;   // computes dst[i] += ref[i] - src[i], without any clipping
+  for (i = 0; i + 8 <= len; i += 8) {   // 8 values per pass
+    const int16x8_t A = vld1q_s16(ref + i);
+    const int16x8_t B = vld1q_s16(src + i);
+    const int16x8_t C = vld1q_s16(dst + i);
+    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
+    const int16x8_t E = vaddq_s16(C, D);   // new_uv
+    vst1q_s16(dst + i, E);
+  }
+  for (; i < len; ++i) {   // left-over (< 8 values), scalar path
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out) {
+  int i;   // A[] and B[] are read up to index len, out[] gets 2 * len values
+  const int16x8_t max = vdupq_n_s16(MAX_Y);
+  const int16x8_t zero = vdupq_n_s16(0);
+  for (i = 0; i + 8 <= len; i += 8) {   // 8 inputs -> 16 outputs per pass
+    const int16x8_t a0 = vld1q_s16(A + i + 0);
+    const int16x8_t a1 = vld1q_s16(A + i + 1);   // same row, shifted by one
+    const int16x8_t b0 = vld1q_s16(B + i + 0);
+    const int16x8_t b1 = vld1q_s16(B + i + 1);
+    const int16x8_t a0b1 = vaddq_s16(a0, b1);
+    const int16x8_t a1b0 = vaddq_s16(a1, b0);
+    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
+    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
+    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
+    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+    const int16x8_t d0 = vaddq_s16(c1, a0);
+    const int16x8_t d1 = vaddq_s16(c0, a1);
+    const int16x8_t e0 = vrshrq_n_s16(d0, 1);   // rounded >> 1
+    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
+    const int16x8x2_t f = vzipq_s16(e0, e1);    // interleave e0[k], e1[k]
+    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);   // best_y + filtered value
+    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);   // clip
+    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+  }
+  for (; i < len; ++i) {   // left-over (< 8 inputs), scalar path
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;   // '+ 8' is the rounding for >> 4
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
+    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
+  }
+}
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitSharpYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
+  WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;   // install the NEON versions
+  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
+  WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+// One stub per entry point that the WEBP_USE_NEON branch above defines.
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
+WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
+
+#endif  // WEBP_USE_NEON
--- a/src/dsp/yuv_sse2.c
+++ b/src/dsp/yuv_sse2.c
@@ -186,8 +186,8 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }

-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@@ -197,8 +197,8 @@ void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@@ -208,8 +208,8 @@ void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@@ -219,8 +219,8 @@ void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+                             const uint8_t* v, uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
@@ -230,8 +230,8 @@ void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                      uint8_t* dst) {
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                           uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
    __m128i R, G, B;
@@ -240,8 +240,8 @@ void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

@@ -262,8 +262,8 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }

-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@@ -3,6 +3,7 @@ noinst_LTLIBRARIES = libwebpencode.la
 libwebpencode_la_SOURCES =
 libwebpencode_la_SOURCES += alpha_enc.c
 libwebpencode_la_SOURCES += analysis_enc.c
+libwebpencode_la_SOURCES += backward_references_cost_enc.c
 libwebpencode_la_SOURCES += backward_references_enc.c
 libwebpencode_la_SOURCES += backward_references_enc.h
 libwebpencode_la_SOURCES += config_enc.c
--- a/src/enc/backward_references_cost_enc.c
+++ b/src/enc/backward_references_cost_enc.c
@@ -0,0 +1,790 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Improves a given set of backward references by analyzing its bit cost.
+// The algorithm is similar to the Zopfli compression algorithm but tailored to
+// images.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+//
+
+#include <assert.h>
+
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                      const PixOrCopy v);
+
+typedef struct {  // Per-symbol bit-cost estimates (see CostModelBuild).
+  double alpha_[VALUES_IN_BYTE];         // estimate per alpha value
+  double red_[VALUES_IN_BYTE];           // estimate per red value
+  double blue_[VALUES_IN_BYTE];          // estimate per blue value
+  double distance_[NUM_DISTANCE_CODES];  // estimate per distance code
+  double* literal_;  // literal bytes, then length codes, then cache indices
+} CostModel;
+
+// Converts symbol counts to bit-cost estimates: log2(total) - log2(count[i]).
+// If at most one symbol is used, every estimate is zero.
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const uint32_t population_counts[], double output[]) {
+  uint32_t total = 0;
+  int used = 0, s;
+  for (s = 0; s < num_symbols; ++s) {
+    const uint32_t count = population_counts[s];
+    total += count;
+    used += (count > 0);
+  }
+  if (used > 1) {
+    const double log_total = VP8LFastLog2(total);
+    for (s = 0; s < num_symbols; ++s) {
+      output[s] = log_total - VP8LFastLog2(population_counts[s]);
+    }
+    return;
+  }
+  memset(output, 0, num_symbols * sizeof(*output));
+}
+
+// Fills 'm' with bit estimates computed from the histogram of 'refs', whose
+// distances are converted to plane codes. Returns 0 if the histogram could
+// not be allocated, 1 otherwise.
+static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
+                          const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor cursor = VP8LRefsCursorInit(refs);
+  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+  const int ok = (histo != NULL);
+
+  if (ok) {
+    // Similar to VP8LHistogramCreate, except for the plane code conversion.
+    VP8LHistogramInit(histo, cache_bits);
+    for (; VP8LRefsCursorOk(&cursor); VP8LRefsCursorNext(&cursor)) {
+      VP8LHistogramAddSinglePixOrCopy(histo, cursor.cur_pos,
+                                      VP8LDistanceToPlaneCode, xsize);
+    }
+
+    ConvertPopulationCountTableToBitEstimates(
+        VP8LHistogramNumCodes(histo->palette_code_bits_),
+        histo->literal_, m->literal_);
+    ConvertPopulationCountTableToBitEstimates(
+        NUM_DISTANCE_CODES, histo->distance_, m->distance_);
+    ConvertPopulationCountTableToBitEstimates(
+        VALUES_IN_BYTE, histo->alpha_, m->alpha_);
+    ConvertPopulationCountTableToBitEstimates(
+        VALUES_IN_BYTE, histo->red_, m->red_);
+    ConvertPopulationCountTableToBitEstimates(
+        VALUES_IN_BYTE, histo->blue_, m->blue_);
+  }
+
+  VP8LFreeHistogram(histo);
+  return ok;
+}
+
+// Cost of coding the 32-bit pixel 'v' as a literal: one estimate per byte.
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  const uint32_t a = v >> 24, r = (v >> 16) & 0xff;
+  const uint32_t lit = (v >> 8) & 0xff, b = v & 0xff;
+  return m->alpha_[a] + m->red_[r] + m->literal_[lit] + m->blue_[b];
+}
+
+// Cost of color-cache index 'idx'; cache entries follow the length codes.
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+  return m->literal_[(int)(VALUES_IN_BYTE + NUM_LENGTH_CODES + idx)];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+                                        uint32_t length) {
+  int prefix, num_extra_bits;  // both are outputs of VP8LPrefixEncodeBits()
+  VP8LPrefixEncodeBits(length, &prefix, &num_extra_bits);
+  return num_extra_bits + m->literal_[VALUES_IN_BYTE + prefix];
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+                                          uint32_t distance) {
+  int prefix, num_extra_bits;  // both are outputs of VP8LPrefixEncodeBits()
+  VP8LPrefixEncodeBits(distance, &prefix, &num_extra_bits);
+  return num_extra_bits + m->distance_[prefix];
+}
+
+// Evaluates coding pixel 'idx' alone (cache hit or literal) on top of
+// 'prev_cost'; records it in cost[]/dist_array[] when it is cheaper.
+static WEBP_INLINE void AddSingleLiteralWithCostModel(
+    const uint32_t* const argb, VP8LColorCache* const hashers,
+    const CostModel* const cost_model, int idx, int use_color_cache,
+    float prev_cost, float* const cost, uint16_t* const dist_array) {
+  const uint32_t pix = argb[idx];
+  const int key = use_color_cache ? VP8LColorCacheContains(hashers, pix) : -1;
+  double total = prev_cost;
+  if (key < 0) {
+    // Not in the cache (or no cache at all): insert it, pay for a literal.
+    if (use_color_cache) VP8LColorCacheInsert(hashers, pix);
+    total += GetLiteralCost(cost_model, pix) * 0.82;
+  } else {
+    total += GetCacheCost(cost_model, key) * 0.68;
+  }
+  if (total < cost[idx]) {
+    cost[idx] = (float)total;
+    dist_array[idx] = 1;  // a single pixel is emitted.
+  }
+}
+
+// -----------------------------------------------------------------------------
+// CostManager and interval handling
+
+// Empirical value to avoid high memory consumption but good for performance.
+#define COST_CACHE_INTERVAL_SIZE_MAX 500
+
+// To perform backward reference every pixel at index index_ is considered and
+// the cost for the MAX_LENGTH following pixels computed. Those following pixels
+// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
+//     cost_ = distance cost at index + GetLengthCost(cost_model, k)
+// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
+// array of size MAX_LENGTH.
+// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
+// minimal values using intervals of constant cost.
+// An interval is defined by the index_ of the pixel that generated it and
+// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
+// it contains the minimum value for pixels between start_ and end_.
+// Intervals are stored in a linked list and ordered by start_. When a new
+// interval has a better value, old intervals are split or removed. There are
+// therefore no overlapping intervals.
+typedef struct CostInterval CostInterval;
+struct CostInterval {
+  float cost_;              // constant cost over [start_, end_)
+  int start_;               // first pixel index covered
+  int end_;                 // one past the last pixel index covered
+  int index_;               // pixel index that generated the interval
+  CostInterval* previous_;  // neighbours in the list ordered by start_
+  CostInterval* next_;
+};
+
+// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
+typedef struct {
+  double cost_;   // GetLengthCost() value shared by the whole range
+  int start_;     // first length of the range
+  int end_;       // Exclusive.
+} CostCacheInterval;
+
+// This structure is in charge of managing intervals and costs.
+// It caches the different CostCacheInterval, caches the different
+// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
+// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
+#define COST_MANAGER_MAX_FREE_LIST 10
+typedef struct {
+  CostInterval* head_;  // First active interval (list ordered by start_).
+  int count_;  // The number of stored intervals.
+  CostCacheInterval* cache_intervals_;  // Runs of equal cost in cost_cache_.
+  size_t cache_intervals_size_;         // Number of entries in the above.
+  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
+  float* costs_;          // Lowest cost found so far, one entry per pixel.
+  uint16_t* dist_array_;  // Step length giving costs_[i]; owned by the caller.
+  // Most of the time, we only need few intervals -> use a free-list, to avoid
+  // fragmentation with small allocs in most common cases.
+  CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
+  CostInterval* free_intervals_;
+  // These are regularly malloc'd remains. This list can't grow larger than
+  // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
+  CostInterval* recycled_intervals_;
+} CostManager;
+
+static void CostIntervalAddToFreeList(CostManager* const manager,
+                                      CostInterval* const interval) {
+  interval->next_ = manager->free_intervals_;  // push on top of the free list
+  manager->free_intervals_ = interval;
+}
+
+static int CostIntervalIsInFreeList(const CostManager* const manager,
+                                    const CostInterval* const interval) {
+  return (interval >= &manager->intervals_[0] &&  // an embedded node?
+          interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
+}
+
+// Rebuilds the free list so that it holds every embedded interval.
+static void CostManagerInitFreeList(CostManager* const manager) {
+  CostInterval* slot = &manager->intervals_[0];
+  int left = COST_MANAGER_MAX_FREE_LIST;
+  manager->free_intervals_ = NULL;
+  while (left-- > 0) CostIntervalAddToFreeList(manager, slot++);
+}
+
+// Frees the malloc'd nodes of a list; embedded nodes are skipped.
+static void DeleteIntervalList(CostManager* const manager,
+                               const CostInterval* interval) {
+  const CostInterval* following;
+  for (; interval != NULL; interval = following) {
+    following = interval->next_;  // read before the node may be freed
+    if (CostIntervalIsInFreeList(manager, interval)) continue;
+    WebPSafeFree((void*)interval);
+  }
+}
+
+// Releases everything owned by 'manager' (NULL is accepted) and leaves it
+// zeroed with a freshly rebuilt free list. The manager itself is not freed.
+static void CostManagerClear(CostManager* const manager) {
+  if (manager != NULL) {
+    // Interval lists first: active ones, then recycled ones.
+    DeleteIntervalList(manager, manager->head_);
+    DeleteIntervalList(manager, manager->recycled_intervals_);
+
+    WebPSafeFree(manager->cache_intervals_);
+    WebPSafeFree(manager->costs_);
+
+    // Zeroing resets every pointer, count_ and cache_intervals_size_.
+    memset(manager, 0, sizeof(*manager));
+    CostManagerInitFreeList(manager);
+  }
+}
+
+// Prepares 'manager' for a run over 'pix_count' pixels: resets its lists,
+// caches the length costs of 'cost_model', groups them into intervals of
+// constant cost and allocates the per-pixel cost array.
+// Returns 1 on success, 0 on memory error (in which case 'manager' has been
+// cleared).
+static int CostManagerInit(CostManager* const manager,
+                           uint16_t* const dist_array, int pix_count,
+                           const CostModel* const cost_model) {
+  const int num_lengths = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
+  const double* const length_costs = manager->cost_cache_;
+  CostCacheInterval* last;
+  int k;
+
+  // Start from an empty state.
+  manager->dist_array_ = dist_array;
+  manager->count_ = 0;
+  manager->head_ = NULL;
+  manager->recycled_intervals_ = NULL;
+  manager->cache_intervals_ = NULL;
+  manager->costs_ = NULL;
+  CostManagerInitFreeList(manager);
+
+  // Cache the length costs and count the runs of identical consecutive values.
+  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
+  manager->cache_intervals_size_ = 1;
+  for (k = 1; k < num_lengths; ++k) {
+    manager->cost_cache_[k] = GetLengthCost(cost_model, k);
+    if (length_costs[k] != length_costs[k - 1]) {
+      ++manager->cache_intervals_size_;
+    }
+  }
+
+  // The theoretical worst case is one interval per length, hence the bound.
+  // According to the original note, fewer than 20 intervals are usual with
+  // the current cost model.
+  assert(manager->cache_intervals_size_ <= MAX_LENGTH);
+  manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
+      manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
+  if (manager->cache_intervals_ == NULL) goto Failure;
+
+  // Build the intervals: the last one is extended while the cost does not
+  // change, and a new one is opened as soon as it does.
+  last = manager->cache_intervals_;
+  last->start_ = 0;
+  last->end_ = 1;
+  last->cost_ = length_costs[0];
+  for (k = 1; k < num_lengths; ++k) {
+    if (length_costs[k] != last->cost_) {
+      ++last;
+      last->start_ = k;
+      last->cost_ = length_costs[k];
+    }
+    last->end_ = k + 1;
+  }
+
+  // Per-pixel best cost, initially huge since only minima are kept.
+  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+  if (manager->costs_ == NULL) goto Failure;
+
+  for (k = 0; k < pix_count; ++k) manager->costs_[k] = 1e38f;
+
+  return 1;
+
+ Failure:
+  // Memory error: release whatever was set up so far.
+  CostManagerClear(manager);
+  return 0;
+}
+
+// Lowers the cost of pixel 'i' to 'cost' when that is an improvement, and
+// then records the matching step length ('i' - 'position' + 1).
+static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
+                                   int position, float cost) {
+  float* const best = &manager->costs_[i];
+  const int k = i - position;
+  assert(k >= 0 && k < MAX_LENGTH);
+  if (cost < *best) {
+    *best = cost;
+    manager->dist_array_[i] = k + 1;
+  }
+}
+
+// Applies UpdateCost() with the same 'position' and 'cost' to every pixel of
+// the half-open range ['start', 'end').
+static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
+                                              int start, int end, int position,
+                                              float cost) {
+  int pix = start;
+  while (pix < end) UpdateCost(manager, pix++, position, cost);
+}
+
+// Links 'prev' and 'next' as neighbours; either may be NULL. Without a
+// 'prev', 'next' becomes the head of the manager's list.
+static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
+                                         CostInterval* const prev,
+                                         CostInterval* const next) {
+  // Forward link: stored in 'prev' if there is one, else in the list head.
+  CostInterval** const forward =
+      (prev == NULL) ? &manager->head_ : &prev->next_;
+  *forward = next;
+  // Backward link.
+  if (next != NULL) next->previous_ = prev;
+}
+
+// Unlinks 'interval' (NULL is a no-op) and keeps its storage for reuse:
+// embedded nodes go to the free list, malloc'd ones to the recycled list.
+static WEBP_INLINE void PopInterval(CostManager* const manager,
+                                    CostInterval* const interval) {
+  CostInterval** reuse_list;
+  if (interval == NULL) return;
+  ConnectIntervals(manager, interval->previous_, interval->next_);
+  reuse_list = CostIntervalIsInFreeList(manager, interval)
+                   ? &manager->free_intervals_
+                   : &manager->recycled_intervals_;
+  interval->next_ = *reuse_list;
+  *reuse_list = interval;
+  --manager->count_;
+  assert(manager->count_ >= 0);
+}
+
+// Refreshes the cost of pixel 'i' from every stored interval that covers
+// it. The list is ordered by start_, so the walk stops at the first
+// interval starting after 'i'.
+// When 'do_clean_intervals' is non-zero, intervals whose (exclusive) end_ is
+// not past 'i' are popped on the way.
+static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
+                                          int do_clean_intervals) {
+  CostInterval* node;
+  CostInterval* following;
+
+  for (node = manager->head_; node != NULL && node->start_ <= i;
+       node = following) {
+    following = node->next_;  // saved first: PopInterval() rewrites next_
+    if (node->end_ > i) {
+      UpdateCost(manager, i, node->index_, node->cost_);
+    } else if (do_clean_intervals) {
+      // Outdated interval: remove it.
+      PopInterval(manager, node);
+    }
+  }
+}
+
+// Inserts the unlinked interval 'current' into the manager's list, which is
+// kept ordered by start_. 'previous' is only a search hint (NULL means "start
+// from the head"): the search first walks backwards, then forwards.
+static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
+                                               CostInterval* const current,
+                                               CostInterval* previous) {
+  CostInterval* successor;
+  assert(current != NULL);
+
+  if (previous == NULL) previous = manager->head_;
+  // Back up until 'previous' does not start after 'current' ...
+  for (; previous != NULL && current->start_ < previous->start_;
+       previous = previous->previous_) {
+  }
+  // ... then advance while the next interval still starts before 'current'.
+  for (; previous != NULL && previous->next_ != NULL &&
+         previous->next_->start_ < current->start_;
+       previous = previous->next_) {
+  }
+
+  successor = (previous != NULL) ? previous->next_ : manager->head_;
+  ConnectIntervals(manager, current, successor);
+  ConnectIntervals(manager, previous, current);
+}
+
+// Stores the interval [start, end) of constant 'cost', generated at
+// 'position', in the manager's start_-ordered list; 'interval_in' is a search
+// hint. Empty ranges are ignored. When the list is full or no node can be
+// obtained, the costs are written to the pixels right away instead.
+static WEBP_INLINE void InsertInterval(CostManager* const manager,
+                                       CostInterval* const interval_in,
+                                       float cost, int position, int start,
+                                       int end) {
+  CostInterval* node = NULL;
+
+  if (start >= end) return;
+  if (manager->count_ < COST_CACHE_INTERVAL_SIZE_MAX) {
+    // Node sources, by preference: embedded free list, recycled list, malloc.
+    if (manager->free_intervals_ != NULL) {
+      node = manager->free_intervals_;
+      manager->free_intervals_ = node->next_;
+    } else if (manager->recycled_intervals_ != NULL) {
+      node = manager->recycled_intervals_;
+      manager->recycled_intervals_ = node->next_;
+    } else {
+      node = (CostInterval*)WebPSafeMalloc(1, sizeof(*node));
+    }
+  }
+  if (node == NULL) {
+    // Cannot store the interval: serialize it.
+    UpdateCostPerInterval(manager, start, end, position, cost);
+    return;
+  }
+
+  node->start_ = start;
+  node->end_ = end;
+  node->index_ = position;
+  node->cost_ = cost;
+  PositionOrphanInterval(manager, node, interval_in);
+
+  ++manager->count_;
+}
+
+// Given a new cost interval defined by its start at position, its length value
+// and distance_cost, add its contributions to the previous intervals and costs.
+// If handling the interval or one of its subintervals becomes too heavy, its
+// contribution is added to the costs right away.
+static WEBP_INLINE void PushInterval(CostManager* const manager,
+                                     double distance_cost, int position,
+                                     int len) {
+  size_t i;
+  CostInterval* interval = manager->head_;
+  CostInterval* interval_next;
+  const CostCacheInterval* const cost_cache_intervals =
+      manager->cache_intervals_;
+  // If the interval is small enough, no need to deal with the heavy
+  // interval logic, just serialize it right away. This constant is empirical.
+  const int kSkipDistance = 10;
+
+  if (len < kSkipDistance) {
+    int j;
+    for (j = position; j < position + len; ++j) {
+      const int k = j - position;
+      float cost_tmp;
+      assert(k >= 0 && k < MAX_LENGTH);
+      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+
+      if (manager->costs_[j] > cost_tmp) {
+        manager->costs_[j] = cost_tmp;
+        manager->dist_array_[j] = k + 1;
+      }
+    }
+    return;
+  }
+
+  for (i = 0; i < manager->cache_intervals_size_ &&
+              cost_cache_intervals[i].start_ < len;
+       ++i) {
+    // Define the intersection of the ith interval with the new one.
+    int start = position + cost_cache_intervals[i].start_;
+    const int end = position + (cost_cache_intervals[i].end_ > len
+                                 ? len
+                                 : cost_cache_intervals[i].end_);
+    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+
+    for (; interval != NULL && interval->start_ < end;
+         interval = interval_next) {
+      interval_next = interval->next_;
+
+      // Skip stored intervals that end before the new one starts.
+      if (start >= interval->end_) continue;
+
+      if (cost >= interval->cost_) {
+        // When intervals are represented, the lower, the better.
+        // [**********************************************************[
+        // start                                                    end
+        //                   [----------------------------------[
+        //                   interval->start_       interval->end_
+        // If we are worse than what we already have, add whatever we have so
+        // far up to interval.
+        const int start_new = interval->end_;
+        InsertInterval(manager, interval, cost, position, start,
+                       interval->start_);
+        start = start_new;
+        if (start >= end) break;
+        continue;
+      }
+
+      if (start <= interval->start_) {
+        if (interval->end_ <= end) {
+          //                   [----------------------------------[
+          //                   interval->start_       interval->end_
+          // [**************************************************************[
+          // start                                                        end
+          // We can safely remove the old interval as it is fully included.
+          PopInterval(manager, interval);
+        } else {
+          //              [------------------------------------[
+          //              interval->start_        interval->end_
+          // [*****************************[
+          // start                       end
+          interval->start_ = end;
+          break;
+        }
+      } else {
+        if (end < interval->end_) {
+          // [--------------------------------------------------------------[
+          // interval->start_                                  interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          // We have to split the old interval as it fully contains the new one.
+          const int end_original = interval->end_;
+          interval->end_ = start;
+          InsertInterval(manager, interval, interval->cost_, interval->index_,
+                         end, end_original);
+          interval = interval->next_;
+          break;
+        } else {
+          // [------------------------------------[
+          // interval->start_        interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          interval->end_ = start;
+        }
+      }
+    }
+    // Insert the remaining interval from start to end.
+    InsertInterval(manager, interval, cost, position, start, end);
+  }
+}
+
+// Computes, for every pixel, the length of the cheapest step (single pixel or
+// backward copy) that ends on it, using a cost model built from 'refs', and
+// stores it in 'dist_array'. Returns 1 on success, 0 otherwise.
+// 'cost_manager' is zero-allocated because the Error path hands it to
+// CostManagerClear() even when CostManagerInit() was never reached.
+static int BackwardReferencesHashChainDistanceOnly(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
+    uint16_t* const dist_array) {
+  int i, ok = 0, cc_init = 0;
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  const size_t literal_array_size =
+      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
+  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+  CostModel* const cost_model =
+      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
+  VP8LColorCache hashers;
+  CostManager* cost_manager =
+      (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
+  int offset_prev = -1, len_prev = -1;
+  double offset_cost = -1;
+  int first_offset_is_constant = -1;  // initialized with 'impossible' value
+  int reach = 0;
+
+  if (cost_model == NULL || cost_manager == NULL) goto Error;
+
+  cost_model->literal_ = (double*)(cost_model + 1);
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) goto Error;
+  if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
+    goto Error;
+  }
+
+  // We loop one pixel at a time, but store all currently best points to
+  // non-processed locations from this point.
+  dist_array[0] = 0;
+  // Add first pixel as literal.
+  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
+                                0.f, cost_manager->costs_, dist_array);
+
+  for (i = 1; i < pix_count; ++i) {
+    const float prev_cost = cost_manager->costs_[i - 1];
+    int offset, len;
+    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+
+    // Try adding the pixel as a literal.
+    AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
+                                  use_color_cache, prev_cost,
+                                  cost_manager->costs_, dist_array);
+
+    // If we are dealing with a non-literal.
+    if (len >= 2) {
+      if (offset != offset_prev) {
+        const int code = VP8LDistanceToPlaneCode(xsize, offset);
+        offset_cost = GetDistanceCost(cost_model, code);
+        first_offset_is_constant = 1;
+        PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+      } else {
+        assert(offset_cost >= 0);
+        assert(len_prev >= 0);
+        assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
+        // Instead of considering all contributions from a pixel i by calling:
+        //         PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+        // we optimize these contributions in case offset_cost stays the same
+        // for consecutive pixels. This describes a set of pixels similar to a
+        // previous set (e.g. constant color regions).
+        if (first_offset_is_constant) {
+          reach = i - 1 + len_prev - 1;
+          first_offset_is_constant = 0;
+        }
+
+        if (i + len - 1 > reach) {
+          // We can only go further with the same offset if the previous
+          // length was maxed, hence len_prev == len == MAX_LENGTH.
+          // TODO(vrabaud), bump i to the end right away (insert cache and
+          // update cost).
+          // TODO(vrabaud), check if one of the points in between does not have
+          // a lower cost.
+          // Already consider the pixel at "reach" to add intervals that are
+          // better than whatever we add.
+          int offset_j, len_j = 0;
+          int j;
+          assert(len == MAX_LENGTH || len == pix_count - i);
+          // Figure out the last consecutive pixel within [i, reach + 1] with
+          // the same offset.
+          for (j = i; j <= reach; ++j) {
+            VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
+            if (offset_j != offset) {
+              VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
+              break;
+            }
+          }
+          // Update the cost at j - 1 and j.
+          UpdateCostAtIndex(cost_manager, j - 1, 0);
+          UpdateCostAtIndex(cost_manager, j, 0);
+
+          PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
+                       j, len_j);
+          reach = j + len_j - 1;
+        }
+      }
+    }
+
+    UpdateCostAtIndex(cost_manager, i, 1);
+    offset_prev = offset;
+    len_prev = len;
+  }
+
+  ok = !refs->error_;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  CostManagerClear(cost_manager);
+  WebPSafeFree(cost_model);
+  WebPSafeFree(cost_manager);
+  return ok;
+}
+
+// Follows the step lengths of 'dist_array' from the last pixel backwards and
+// packs them, in forward order, at the end of the array ('*chosen_path').
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint16_t* const dist_array,
+                           int dist_array_size,
+                           uint16_t** const chosen_path,
+                           int* const chosen_path_size) {
+  // Indices, not pointers: no pointer is ever formed before the array start.
+  int cur = dist_array_size - 1, path = dist_array_size;
+  while (cur >= 0) {
+    const int k = dist_array[cur];
+    assert(k > 0);  // a zero step would loop forever
+    dist_array[--path] = (uint16_t)k;
+    cur -= k;
+  }
+  *chosen_path = dist_array + path;
+  *chosen_path_size = dist_array_size - path;
+}
+
+// Rebuilds 'refs' by following 'chosen_path' from the first pixel: an entry
+// equal to 1 emits the current pixel (as a color-cache index when the cache
+// holds it, as a literal otherwise), any other entry emits a copy of that
+// length at the offset given by the hash chain. Returns 1 on success.
+static int BackwardReferencesHashChainFollowChosenPath(
+    const uint32_t* const argb, int cache_bits,
+    const uint16_t* const chosen_path, int chosen_path_size,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
+  const int use_color_cache = (cache_bits > 0);
+  VP8LColorCache hashers;
+  int pos = 0;  // index of the next pixel to encode
+  int step;
+
+  // Nothing has been acquired yet, so a failed init can return directly.
+  if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+  VP8LClearBackwardRefs(refs);
+  for (step = 0; step < chosen_path_size; ++step) {
+    const int len = chosen_path[step];
+    if (len == 1) {
+      // Single pixel.
+      const uint32_t pixel = argb[pos++];
+      const int key =
+          use_color_cache ? VP8LColorCacheContains(&hashers, pixel) : -1;
+      if (key >= 0) {
+        // The cache holds the pixel: emit its index.
+        VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCacheIdx(key));
+      } else {
+        // Literal: remember it in the cache when there is one.
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
+        VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(pixel));
+      }
+    } else {
+      // Backward copy; the copied pixels still have to enter the cache.
+      const int offset = VP8LHashChainFindOffset(hash_chain, pos);
+      const int copy_end = pos + len;
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+      if (use_color_cache) {
+        for (; pos < copy_end; ++pos) {
+          VP8LColorCacheInsert(&hashers, argb[pos]);
+        }
+      }
+      pos = copy_end;
+    }
+  }
+
+  // Release the cache (if any) and report the state of 'refs'.
+  if (use_color_cache) VP8LColorCacheClear(&hashers);
+  return !refs->error_;
+}
+
+// Returns 1 on success.
+extern int VP8LBackwardReferencesTraceBackwards(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain,
+    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
+                                         const uint32_t* const argb,
+                                         int cache_bits,
+                                         const VP8LHashChain* const hash_chain,
+                                         const VP8LBackwardRefs* const refs_src,
+                                         VP8LBackwardRefs* const refs_dst) {
+  const int dist_array_size = xsize * ysize;
+  uint16_t* const dist_array =
+      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
+  uint16_t* chosen_path = NULL;
+  int chosen_path_size = 0;
+  int ok = 0;
+
+  // Three stages, the later ones running only if the earlier ones succeed:
+  // compute the best step length per pixel, extract the chosen path (packed
+  // at the end of 'dist_array'), then regenerate the references in
+  // 'refs_dst' by following that path.
+  if (dist_array != NULL &&
+      BackwardReferencesHashChainDistanceOnly(
+          xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
+    TraceBackwards(dist_array, dist_array_size, &chosen_path,
+                   &chosen_path_size);
+    ok = BackwardReferencesHashChainFollowChosenPath(
+             argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
+             refs_dst) != 0;
+  }
+
+  WebPSafeFree(dist_array);
+  return ok;
+}
--- a/src/enc/backward_references_enc.c
+++ b/src/enc/backward_references_enc.c
--- a/src/enc/backward_references_enc.h
+++ b/src/enc/backward_references_enc.h
@@ -113,6 +113,15 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
 #define HASH_BITS 18
 #define HASH_SIZE (1 << HASH_BITS)

+// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
+// is used in VP8LHashChain.
+#define MAX_LENGTH_BITS 12
+// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
+#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
+#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
+#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
+#endif
+
 typedef struct VP8LHashChain VP8LHashChain;
 struct VP8LHashChain {
  // The 20 most significant bits contain the offset at which the best match
@@ -134,6 +143,24 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                      int low_effort);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory

+static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
+                                               const int base_position) {
+  return p->offset_length_[base_position] >> MAX_LENGTH_BITS;
+}
+
+static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p,
+                                               const int base_position) {
+  return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
+}
+
+static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p,
+                                              int base_position,
+                                              int* const offset_ptr,
+                                              int* const length_ptr) {
+  *offset_ptr = VP8LHashChainFindOffset(p, base_position);
+  *length_ptr = VP8LHashChainFindLength(p, base_position);
+}
+
 // -----------------------------------------------------------------------------
 // VP8LBackwardRefs (block-based backward-references storage)

@@ -158,9 +185,6 @@ struct VP8LBackwardRefs {
 void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
 // Release memory for backward references.
 void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
-// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
-int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
-                         VP8LBackwardRefs* const dst);

 // Cursor for iterating on references content
 typedef struct {
@@ -189,6 +213,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // -----------------------------------------------------------------------------
 // Main entry points

+enum VP8LLZ77Type {
+  kLZ77Standard = 1,
+  kLZ77RLE = 2,
+  kLZ77Box = 4
+};
+
 // Evaluates best possible backward references for specified quality.
 // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
 // bits to use (passing 0 implies disabling the local color cache).
@@ -197,8 +227,9 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // refs[0] or refs[1].
 VP8LBackwardRefs* VP8LGetBackwardReferences(
    int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]);
+    int low_effort, int lz77_types_to_try, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
+    VP8LBackwardRefs* const refs_tmp2);

 #ifdef __cplusplus
 }
--- a/src/enc/histogram_enc.c
+++ b/src/enc/histogram_enc.c
@@ -76,7 +76,7 @@ void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
                            VP8LHistogram* const histo) {
  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
  while (VP8LRefsCursorOk(&c)) {
-    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0);
    VP8LRefsCursorNext(&c);
  }
 }
@@ -138,7 +138,9 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
 // -----------------------------------------------------------------------------

 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v) {
+                                     const PixOrCopy* const v,
+                                     int (*const distance_modifier)(int, int),
+                                     int distance_modifier_arg0) {
  if (PixOrCopyIsLiteral(v)) {
    ++histo->alpha_[PixOrCopyLiteral(v, 3)];
    ++histo->red_[PixOrCopyLiteral(v, 2)];
@@ -152,7 +154,13 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
    int code, extra_bits;
    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
    ++histo->literal_[NUM_LITERAL_CODES + code];
-    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    if (distance_modifier == NULL) {
+      VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    } else {
+      VP8LPrefixEncodeBits(
+          distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)),
+          &code, &extra_bits);
+    }
    ++histo->distance_[code];
  }
 }
@@ -473,7 +481,7 @@ static void HistogramBuild(
  while (VP8LRefsCursorOk(&c)) {
    const PixOrCopy* const v = c.cur_pos;
    const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
-    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
+    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0);
    x += PixOrCopyLength(v);
    while (x >= xsize) {
      x -= xsize;
@@ -523,11 +531,12 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,

 // Compact image_histo[] by merging some histograms with same bin_id together if
 // it's advantageous.
-static VP8LHistogram* HistogramCombineEntropyBin(
-    VP8LHistogramSet* const image_histo,
-    VP8LHistogram* cur_combo,
-    const uint16_t* const bin_map, int bin_map_size, int num_bins,
-    double combine_cost_factor, int low_effort) {
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+                                       VP8LHistogram* cur_combo,
+                                       const uint16_t* const bin_map,
+                                       int bin_map_size, int num_bins,
+                                       double combine_cost_factor,
+                                       int low_effort) {
  VP8LHistogram** const histograms = image_histo->histograms;
  int idx;
  // Work in-place: processed histograms are put at the beginning of
@@ -593,14 +602,13 @@ static VP8LHistogram* HistogramCombineEntropyBin(
      UpdateHistogramCost(histograms[idx]);
    }
  }
-  return cur_combo;
 }

+// Implement a Lehmer random number generator with a multiplicative constant of
+// 48271 and a modulo constant of 2^31 - 1.
 static uint32_t MyRand(uint32_t* const seed) {
-  *seed = (*seed * 16807ull) & 0xffffffffu;
-  if (*seed == 0) {
-    *seed = 1;
-  }
+  *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
+  assert(*seed > 0);
  return *seed;
 }

@@ -641,57 +649,75 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
 static void HistoQueueClear(HistoQueue* const histo_queue) {
  assert(histo_queue != NULL);
  WebPSafeFree(histo_queue->queue);
+  histo_queue->size = 0;
+  histo_queue->max_size = 0;
 }

-static void SwapHistogramPairs(HistogramPair *p1,
-                               HistogramPair *p2) {
-  const HistogramPair tmp = *p1;
-  *p1 = *p2;
-  *p2 = tmp;
+// Pop a specific pair in the queue by replacing it with the last one
+// and shrinking the queue.
+static void HistoQueuePopPair(HistoQueue* const histo_queue,
+                              HistogramPair* const pair) {
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  *pair = histo_queue->queue[histo_queue->size - 1];
+  --histo_queue->size;
 }

-// Given a valid priority queue in range [0, queue_size) this function checks
-// whether histo_queue[queue_size] should be accepted and swaps it with the
-// front if it is smaller. Otherwise, it leaves it as is.
-static void UpdateQueueFront(HistoQueue* const histo_queue) {
-  if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return;
-
-  if (histo_queue->queue[histo_queue->size].cost_diff <
-      histo_queue->queue[0].cost_diff) {
-    SwapHistogramPairs(histo_queue->queue,
-                       histo_queue->queue + histo_queue->size);
+// Swap 'pair' with the queue head if its cost_diff is lower than the head's.
+static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
+                                 HistogramPair* const pair) {
+  assert(pair->cost_diff < 0.);
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  if (pair->cost_diff < histo_queue->queue[0].cost_diff) {
+    // Replace the best pair.
+    const HistogramPair tmp = histo_queue->queue[0];
+    histo_queue->queue[0] = *pair;
+    *pair = tmp;
  }
-  ++histo_queue->size;
-
-  // We cannot add more elements than the capacity.
-  // The allocation adds an extra element to the official capacity so that
-  // histo_queue->queue[histo_queue->max_size] is read/written within bound.
-  assert(histo_queue->size <= histo_queue->max_size);
 }

-// -----------------------------------------------------------------------------
-
-static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2,
-                        HistogramPair* const pair) {
-  VP8LHistogram* h1;
-  VP8LHistogram* h2;
+// Create a pair from indices "idx1" and "idx2" and push it to the queue
+// provided its cost is inferior to "threshold", a non-positive entropy.
+// It returns the cost of the pair, or 0. if it is not inferior to threshold.
+static double HistoQueuePush(HistoQueue* const histo_queue,
+                             VP8LHistogram** const histograms, int idx1,
+                             int idx2, double threshold) {
+  const VP8LHistogram* h1;
+  const VP8LHistogram* h2;
+  HistogramPair pair;
  double sum_cost;

+  assert(threshold <= 0.);
  if (idx1 > idx2) {
    const int tmp = idx2;
    idx2 = idx1;
    idx1 = tmp;
  }
-  pair->idx1 = idx1;
-  pair->idx2 = idx2;
+  pair.idx1 = idx1;
+  pair.idx2 = idx2;
  h1 = histograms[idx1];
  h2 = histograms[idx2];
  sum_cost = h1->bit_cost_ + h2->bit_cost_;
-  pair->cost_combo = 0.;
-  GetCombinedHistogramEntropy(h1, h2, sum_cost, &pair->cost_combo);
-  pair->cost_diff = pair->cost_combo - sum_cost;
+  pair.cost_combo = 0.;
+  GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair.cost_combo);
+  pair.cost_diff = pair.cost_combo - sum_cost;
+
+  // Do not even consider the pair if it does not improve the entropy.
+  if (pair.cost_diff >= threshold) return 0.;
+
+  // We cannot add more elements than the capacity.
+  assert(histo_queue->size < histo_queue->max_size);
+  histo_queue->queue[histo_queue->size++] = pair;
+  HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
+
+  return pair.cost_diff;
 }

+// -----------------------------------------------------------------------------
+
 // Combines histograms by continuously choosing the one with the highest cost
 // reduction.
 static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
@@ -714,13 +740,11 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
    clusters[i] = i;
    for (j = i + 1; j < image_histo_size; ++j) {
      // Initialize positions array.
-      PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size]);
-      UpdateQueueFront(&histo_queue);
+      HistoQueuePush(&histo_queue, histograms, i, j, 0.);
    }
  }

  while (image_histo_size > 1 && histo_queue.size > 0) {
-    HistogramPair* copy_to;
    const int idx1 = histo_queue.queue[0].idx1;
    const int idx2 = histo_queue.queue[0].idx2;
    HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
@@ -733,31 +757,22 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
    }
    --image_histo_size;

-    // Remove pairs intersecting the just combined best pair. This will
-    // therefore pop the head of the queue.
-    copy_to = histo_queue.queue;
-    for (i = 0; i < histo_queue.size; ++i) {
+    // Remove pairs intersecting the just combined best pair.
+    for (i = 0; i < histo_queue.size;) {
      HistogramPair* const p = histo_queue.queue + i;
      if (p->idx1 == idx1 || p->idx2 == idx1 ||
          p->idx1 == idx2 || p->idx2 == idx2) {
-        // Do not copy the invalid pair.
-        continue;
+        HistoQueuePopPair(&histo_queue, p);
+      } else {
+        HistoQueueUpdateHead(&histo_queue, p);
+        ++i;
      }
-      if (p->cost_diff < histo_queue.queue[0].cost_diff) {
-        // Replace the top of the queue if we found better.
-        SwapHistogramPairs(histo_queue.queue, p);
-      }
-      SwapHistogramPairs(copy_to, p);
-      ++copy_to;
    }
-    histo_queue.size = (int)(copy_to - histo_queue.queue);

    // Push new pairs formed with combined histogram to the queue.
    for (i = 0; i < image_histo_size; ++i) {
      if (clusters[i] != idx1) {
-        PreparePair(histograms, idx1, clusters[i],
-                    &histo_queue.queue[histo_queue.size]);
-        UpdateQueueFront(&histo_queue);
+        HistoQueuePush(&histo_queue, histograms, idx1, clusters[i], 0.);
      }
    }
  }
@@ -777,90 +792,130 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
  return ok;
 }

-static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
-                                       VP8LHistogram* tmp_histo,
-                                       VP8LHistogram* best_combo,
-                                       int quality, int min_cluster_size) {
+// Perform histogram aggregation using a stochastic approach.
+// 'do_greedy' is set to 1 if a greedy approach needs to be performed
+// afterwards, 0 otherwise.
+static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
+                                      int min_cluster_size,
+                                      int* const do_greedy) {
  int iter;
-  uint32_t seed = 0;
+  uint32_t seed = 1;
  int tries_with_no_success = 0;
  int image_histo_size = image_histo->size;
-  const int iter_mult = (quality < 25) ? 2 : 2 + (quality - 25) / 8;
-  const int outer_iters = image_histo_size * iter_mult;
-  const int num_pairs = image_histo_size / 2;
+  const int outer_iters = image_histo_size;
  const int num_tries_no_success = outer_iters / 2;
-  int idx2_max = image_histo_size - 1;
-  int do_brute_dorce = 0;
  VP8LHistogram** const histograms = image_histo->histograms;
+  // Priority queue of histogram pairs. Its size of "kHistoQueueSizeSqrt"^2
+  // impacts the quality of the compression and the speed: the smaller the
+  // faster but the worse for the compression.
+  HistoQueue histo_queue;
+  const int kHistoQueueSizeSqrt = 3;
+  int ok = 0;

+  if (!HistoQueueInit(&histo_queue, kHistoQueueSizeSqrt)) {
+    goto End;
+  }
  // Collapse similar histograms in 'image_histo'.
  ++min_cluster_size;
-  for (iter = 0;
-       iter < outer_iters && image_histo_size >= min_cluster_size;
+  for (iter = 0; iter < outer_iters && image_histo_size >= min_cluster_size &&
+                 ++tries_with_no_success < num_tries_no_success;
       ++iter) {
-    double best_cost_diff = 0.;
+    double best_cost =
+        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
    int best_idx1 = -1, best_idx2 = 1;
    int j;
-    int num_tries =
-        (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
-    // Use a brute force approach if:
-    // - stochastic has not worked for a while and
-    // - if the number of iterations for brute force is less than the number of
-    // iterations if we never find a match ever again stochastically (hence
-    // num_tries times the number of remaining outer iterations).
-    do_brute_dorce =
-        (tries_with_no_success > 10) &&
-        (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter));
-    if (do_brute_dorce) num_tries = idx2_max;
+    const uint32_t rand_range = (image_histo_size - 1) * image_histo_size;
+    // image_histo_size / 2 was chosen empirically. Less means faster but worse
+    // compression.
+    const int num_tries = image_histo_size / 2;

-    seed += iter;
    for (j = 0; j < num_tries; ++j) {
-      double curr_cost_diff;
-      // Choose two histograms at random and try to combine them.
-      uint32_t idx1, idx2;
-      if (do_brute_dorce) {
-        // Use a brute force approach.
-        idx1 = (uint32_t)j;
-        idx2 = (uint32_t)idx2_max;
-      } else {
-        const uint32_t tmp = (j & 7) + 1;
-        const uint32_t diff =
-            (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
-        idx1 = MyRand(&seed) % image_histo_size;
-        idx2 = (idx1 + diff + 1) % image_histo_size;
-        if (idx1 == idx2) {
+      double curr_cost;
+      // Choose two different histograms at random and try to combine them.
+      const uint32_t tmp = MyRand(&seed) % rand_range;
+      const uint32_t idx1 = tmp / (image_histo_size - 1);
+      uint32_t idx2 = tmp % (image_histo_size - 1);
+      if (idx2 >= idx1) ++idx2;
+
+      // Calculate cost reduction on combination.
+      curr_cost =
+          HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost);
+      if (curr_cost < 0) {  // found a better pair?
+        best_cost = curr_cost;
+        // Stop picking pairs once the queue has reached full capacity.
+        if (histo_queue.size == histo_queue.max_size) break;
+      }
+    }
+    if (histo_queue.size == 0) continue;
+
+    // Merge the two best histograms.
+    best_idx1 = histo_queue.queue[0].idx1;
+    best_idx2 = histo_queue.queue[0].idx2;
+    assert(best_idx1 < best_idx2);
+    HistogramAddEval(histograms[best_idx1], histograms[best_idx2],
+                     histograms[best_idx1], 0);
+    // Swap the best_idx2 histogram with the last one (which is now unused).
+    --image_histo_size;
+    if (best_idx2 != image_histo_size) {
+      HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
+    }
+    histograms[image_histo_size] = NULL;
+    // Parse the queue and update each pair that deals with best_idx1,
+    // best_idx2 or image_histo_size.
+    for (j = 0; j < histo_queue.size;) {
+      HistogramPair* const p = histo_queue.queue + j;
+      const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2;
+      const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2;
+      int do_eval = 0;
+      // The front pair could have been duplicated by a random pick so
+      // check for it all the time nevertheless.
+      if (is_idx1_best && is_idx2_best) {
+        HistoQueuePopPair(&histo_queue, p);
+        continue;
+      }
+      // Any pair containing one of the two best indices should only refer to
+      // best_idx1. Its cost should also be updated.
+      if (is_idx1_best) {
+        p->idx1 = best_idx1;
+        do_eval = 1;
+      } else if (is_idx2_best) {
+        p->idx2 = best_idx1;
+        do_eval = 1;
+      }
+      if (p->idx2 == image_histo_size) {
+        // No need to re-evaluate here as it does not involve a pair
+        // containing best_idx1 or best_idx2.
+        p->idx2 = best_idx2;
+      }
+      assert(p->idx2 < image_histo_size);
+      // Make sure the index order is respected.
+      if (p->idx1 > p->idx2) {
+        const int tmp = p->idx2;
+        p->idx2 = p->idx1;
+        p->idx1 = tmp;
+      }
+      if (do_eval) {
+        // Re-evaluate the cost of an updated pair.
+        GetCombinedHistogramEntropy(histograms[p->idx1], histograms[p->idx2], 0,
+                                    &p->cost_diff);
+        if (p->cost_diff >= 0.) {
+          HistoQueuePopPair(&histo_queue, p);
          continue;
        }
      }
+      HistoQueueUpdateHead(&histo_queue, p);
+      ++j;
+    }

-      // Calculate cost reduction on combining.
-      curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
-                                        tmp_histo, best_cost_diff);
-      if (curr_cost_diff < best_cost_diff) {  // found a better pair?
-        HistogramSwap(&best_combo, &tmp_histo);
-        best_cost_diff = curr_cost_diff;
-        best_idx1 = idx1;
-        best_idx2 = idx2;
-      }
-    }
-    if (do_brute_dorce) --idx2_max;
-
-    if (best_idx1 >= 0) {
-      HistogramSwap(&best_combo, &histograms[best_idx1]);
-      // swap best_idx2 slot with last one (which is now unused)
-      --image_histo_size;
-      if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1;
-      if (best_idx2 != image_histo_size) {
-        HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
-        histograms[image_histo_size] = NULL;
-      }
-      tries_with_no_success = 0;
-    }
-    if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) {
-      break;
-    }
+    tries_with_no_success = 0;
  }
  image_histo->size = image_histo_size;
+  *do_greedy = (image_histo->size <= min_cluster_size);
+  ok = 1;
+
+End:
+  HistoQueueClear(&histo_queue);
+  return ok;
 }

 // -----------------------------------------------------------------------------
@@ -925,7 +980,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             int quality, int low_effort,
                             int histo_bits, int cache_bits,
                             VP8LHistogramSet* const image_histo,
-                             VP8LHistogramSet* const tmp_histos,
+                             VP8LHistogram* const tmp_histo,
                             uint16_t* const histogram_symbols) {
  int ok = 0;
  const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
@@ -933,7 +988,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  const int image_histo_raw_size = histo_xsize * histo_ysize;
  VP8LHistogramSet* const orig_histo =
      VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
-  VP8LHistogram* cur_combo;
  // Don't attempt linear bin-partition heuristic for
  // histograms of small sizes (as bin_map will be very sparse) and
  // maximum quality q==100 (to preserve the compression gains at that level).
@@ -948,7 +1002,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  // Copies the histograms and computes its bit_cost.
  HistogramCopyAndAnalyze(orig_histo, image_histo);

-  cur_combo = tmp_histos->histograms[1];  // pick up working slot
  if (entropy_combine) {
    const int bin_map_size = orig_histo->size;
    // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok.
@@ -958,10 +1011,9 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,

    HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
    // Collapse histograms with similar entropy.
-    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo,
-                                           bin_map, bin_map_size,
-                                           entropy_combine_num_bins,
-                                           combine_cost_factor, low_effort);
+    HistogramCombineEntropyBin(image_histo, tmp_histo, bin_map, bin_map_size,
+                               entropy_combine_num_bins, combine_cost_factor,
+                               low_effort);
  }

  // Don't combine the histograms using stochastic and greedy heuristics for
@@ -970,10 +1022,11 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
    const float x = quality / 100.f;
    // cubic ramp between 1 and MAX_HISTO_GREEDY:
    const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
-    HistogramCombineStochastic(image_histo, tmp_histos->histograms[0],
-                               cur_combo, quality, threshold_size);
-    if ((image_histo->size <= threshold_size) &&
-        !HistogramCombineGreedy(image_histo)) {
+    int do_greedy;
+    if (!HistogramCombineStochastic(image_histo, threshold_size, &do_greedy)) {
+      goto Error;
+    }
+    if (do_greedy && !HistogramCombineGreedy(image_histo)) {
      goto Error;
    }
  }
--- a/src/enc/histogram_enc.h
+++ b/src/enc/histogram_enc.h
@@ -90,7 +90,9 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits);

 // Accumulate a token 'v' into a histogram.
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v);
+                                     const PixOrCopy* const v,
+                                     int (*const distance_modifier)(int, int),
+                                     int distance_modifier_arg0);

 static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
  return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
@@ -103,7 +105,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             int quality, int low_effort,
                             int histogram_bits, int cache_bits,
                             VP8LHistogramSet* const image_in,
-                             VP8LHistogramSet* const tmp_histos,
+                             VP8LHistogram* const tmp_histo,
                             uint16_t* const histogram_symbols);

 // Returns the entropy for the symbols in the input array.
--- a/src/enc/near_lossless_enc.c
+++ b/src/enc/near_lossless_enc.c
@@ -26,9 +26,9 @@

 // Quantizes the value up or down to a multiple of 1<<bits (or to 255),
 // choosing the closer one, resolving ties using bankers' rounding.
-static int FindClosestDiscretized(int a, int bits) {
-  const int mask = (1 << bits) - 1;
-  const int biased = a + (mask >> 1) + ((a >> bits) & 1);
+static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
+  const uint32_t mask = (1u << bits) - 1;
+  const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
  assert(bits > 0);
  if (biased > 0xff) return 0xff;
  return biased & ~mask;
@@ -69,22 +69,30 @@ static int IsSmooth(const uint32_t* const prev_row,
 }

 // Adjusts pixel values of image with given maximum error.
-static void NearLossless(int xsize, int ysize, uint32_t* argb,
-                         int limit_bits, uint32_t* copy_buffer) {
+static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
+                         int stride, int limit_bits, uint32_t* copy_buffer,
+                         uint32_t* argb_dst) {
  int x, y;
  const int limit = 1 << limit_bits;
  uint32_t* prev_row = copy_buffer;
  uint32_t* curr_row = prev_row + xsize;
  uint32_t* next_row = curr_row + xsize;
-  memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0]));
+  memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
+  memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));

-  for (y = 1; y < ysize - 1; ++y) {
-    uint32_t* const curr_argb_row = argb + y * xsize;
-    uint32_t* const next_argb_row = curr_argb_row + xsize;
-    memcpy(next_row, next_argb_row, xsize * sizeof(argb[0]));
-    for (x = 1; x < xsize - 1; ++x) {
-      if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) {
-        curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+  for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
+    if (y == 0 || y == ysize - 1) {
+      memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
+    } else {
+      memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+      argb_dst[0] = argb_src[0];
+      argb_dst[xsize - 1] = argb_src[xsize - 1];
+      for (x = 1; x < xsize - 1; ++x) {
+        if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+          argb_dst[x] = curr_row[x];
+        } else {
+          argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+        }
      }
    }
    {
@@ -97,25 +105,37 @@ static void NearLossless(int xsize, int ysize, uint32_t* argb,
  }
 }

-int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) {
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+                         uint32_t* const argb_dst) {
  int i;
+  const int xsize = picture->width;
+  const int ysize = picture->height;
+  const int stride = picture->argb_stride;
  uint32_t* const copy_buffer =
      (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
  const int limit_bits = VP8LNearLosslessBits(quality);
-  assert(argb != NULL);
-  assert(limit_bits >= 0);
+  assert(argb_dst != NULL);
+  assert(limit_bits > 0);
  assert(limit_bits <= MAX_LIMIT_BITS);
  if (copy_buffer == NULL) {
    return 0;
  }
  // For small icon images, don't attempt to apply near-lossless compression.
-  if (xsize < MIN_DIM_FOR_NEAR_LOSSLESS && ysize < MIN_DIM_FOR_NEAR_LOSSLESS) {
+  if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+       ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
+      ysize < 3) {
+    for (i = 0; i < ysize; ++i) {
+      memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
+             xsize * sizeof(*argb_dst));
+    }
    WebPSafeFree(copy_buffer);
    return 1;
  }

-  for (i = limit_bits; i != 0; --i) {
-    NearLossless(xsize, ysize, argb, i, copy_buffer);
+  NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
+               argb_dst);
+  for (i = limit_bits - 1; i != 0; --i) {
+    NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
  }
  WebPSafeFree(copy_buffer);
  return 1;
--- a/src/enc/picture_csp_enc.c
+++ b/src/enc/picture_csp_enc.c
@@ -171,7 +171,7 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 #if defined(USE_GAMMA_COMPRESSION)

 // float variant of gamma-correction
-// We use tables of different size and precision for the Rec709
+// We use tables of different size and precision for the Rec709 / BT2020
 // transfer function.
 #define kGammaF (1./0.45)
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
@@ -183,8 +183,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
    int v;
    const double norm = 1. / MAX_Y_T;
    const double scale = 1. / kGammaTabSize;
-    const double a = 0.099;
-    const double thresh = 0.018;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
    for (v = 0; v <= MAX_Y_T; ++v) {
      const double g = norm * v;
      if (g <= thresh * 4.5) {
@@ -1105,9 +1105,14 @@ static int Import(WebPPicture* const picture,

  if (import_alpha) {
    uint32_t* dst = picture->argb;
+    const int do_copy = !swap_rb && !ALPHA_IS_LAST;
    assert(step == 4);
    for (y = 0; y < height; ++y) {
-      VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+      if (do_copy) {
+        memcpy(dst, r_ptr, width * sizeof(*dst));
+      } else {
+        VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+      }
      a_ptr += rgb_stride;
      r_ptr += rgb_stride;
      g_ptr += rgb_stride;
--- a/src/enc/predictor_enc.c
+++ b/src/enc/predictor_enc.c
@@ -180,6 +180,7 @@ static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
 // max_quantization which is a power of 2, smaller than max_diff). Take care if
 // value and predict have undergone subtract green, which means that red and
 // blue are represented as offsets from green.
+#define NEAR_LOSSLESS_DIFF(a, b) (uint8_t)((((int)(a) - (int)(b))) & 0xff)
 static uint32_t NearLossless(uint32_t value, uint32_t predict,
                             int max_quantization, int max_diff,
                             int used_subtract_green) {
@@ -196,7 +197,7 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
  }
  if ((value >> 24) == 0 || (value >> 24) == 0xff) {
    // Preserve transparency of fully transparent or fully opaque pixels.
-    a = ((value >> 24) - (predict >> 24)) & 0xff;
+    a = NEAR_LOSSLESS_DIFF(value >> 24, predict >> 24);
  } else {
    a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
  }
@@ -209,15 +210,16 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
    // The amount by which green has been adjusted during quantization. It is
    // subtracted from red and blue for compensation, to avoid accumulating two
    // quantization errors in them.
-    green_diff = (new_green - (value >> 8)) & 0xff;
+    green_diff = NEAR_LOSSLESS_DIFF(new_green, value >> 8);
  }
-  r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
+  r = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value >> 16, green_diff),
                            (predict >> 16) & 0xff, 0xff - new_green,
                            quantization);
-  b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
-                            0xff - new_green, quantization);
+  b = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value, green_diff),
+                            predict & 0xff, 0xff - new_green, quantization);
  return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
 }
+#undef NEAR_LOSSLESS_DIFF

 // Stores the difference between the pixel and its prediction in "out".
 // In case of a lossy encoding, updates the source image to avoid propagating
--- a/src/enc/vp8i_enc.h
+++ b/src/enc/vp8i_enc.h
@@ -504,7 +504,8 @@ void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);

  // in near_lossless.c
 // Near lossless preprocessing in RGB color-space.
-int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality);
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+                         uint32_t* const argb_dst);
 // Near lossless adjustment for predictors.
 void VP8ApplyNearLosslessPredict(int xsize, int ysize, int pred_bits,
                                 const uint32_t* argb_orig,
--- a/src/enc/vp8l_enc.c
+++ b/src/enc/vp8l_enc.c
--- a/src/enc/vp8li_enc.h
+++ b/src/enc/vp8li_enc.h
@@ -27,16 +27,24 @@ extern "C" {
 // maximum value of transform_bits_ in VP8LEncoder.
 #define MAX_TRANSFORM_BITS 6

+typedef enum {  // what VP8LEncoder's argb_ buffer holds (see argb_content_)
+  kEncoderNone = 0,
+  kEncoderARGB,
+  kEncoderNearLossless,
+  kEncoderPalette
+} VP8LEncoderARGBContent;
+
 typedef struct {
  const WebPConfig* config_;      // user configuration and parameters
  const WebPPicture* pic_;        // input picture.

-  uint32_t* argb_;                // Transformed argb image data.
-  uint32_t* argb_scratch_;        // Scratch memory for argb rows
-                                  // (used for prediction).
-  uint32_t* transform_data_;      // Scratch memory for transform data.
-  uint32_t* transform_mem_;       // Currently allocated memory.
-  size_t    transform_mem_size_;  // Currently allocated memory size.
+  uint32_t* argb_;                       // Transformed argb image data.
+  VP8LEncoderARGBContent argb_content_;  // Content type of the argb buffer.
+  uint32_t* argb_scratch_;               // Scratch memory for argb rows
+                                         // (used for prediction).
+  uint32_t* transform_data_;             // Scratch memory for transform data.
+  uint32_t* transform_mem_;              // Currently allocated memory.
+  size_t    transform_mem_size_;         // Currently allocated memory size.

  int       current_width_;       // Corresponds to packed image width.

@@ -54,8 +62,7 @@ typedef struct {
  uint32_t palette_[MAX_PALETTE_SIZE];

  // Some 'scratch' (potentially large) objects.
-  struct VP8LBackwardRefs refs_[2];  // Backward Refs array corresponding to
-                                     // LZ77 & RLE coding.
+  struct VP8LBackwardRefs refs_[3];  // Backward Refs array for temporaries.
  VP8LHashChain hash_chain_;         // HashChain data for constructing
                                     // backward references.
 } VP8LEncoder;
--- a/src/mux/anim_encode.c
+++ b/src/mux/anim_encode.c
@@ -35,7 +35,7 @@
 // Stores frame rectangle dimensions.
 typedef struct {
  int x_offset_, y_offset_, width_, height_;
-} FrameRect;
+} FrameRectangle;

 // Used to store two candidates of encoded data for an animation frame. One of
 // the two will be chosen later.
@@ -50,7 +50,7 @@ struct WebPAnimEncoder {
  const int canvas_height_;                 // Canvas height.
  const WebPAnimEncoderOptions options_;    // Global encoding options.

-  FrameRect prev_rect_;               // Previous WebP frame rectangle.
+  FrameRectangle prev_rect_;          // Previous WebP frame rectangle.
  WebPConfig last_config_;            // Cached in case a re-encode is needed.
  WebPConfig last_config_reversed_;   // If 'last_config_' uses lossless, then
                                      // this config uses lossy and vice versa;
@@ -206,7 +206,7 @@ static void ClearRectangle(WebPPicture* const picture,
 }

 static void WebPUtilClearPic(WebPPicture* const picture,
-                             const FrameRect* const rect) {
+                             const FrameRectangle* const rect) {
  if (rect != NULL) {
    ClearRectangle(picture, rect->x_offset_, rect->y_offset_,
                   rect->width_, rect->height_);
@@ -400,7 +400,7 @@ static WEBP_INLINE int ComparePixelsLossy(const uint32_t* src, int src_step,
  return 1;
 }

-static int IsEmptyRect(const FrameRect* const rect) {
+static int IsEmptyRect(const FrameRectangle* const rect) {
  return (rect->width_ == 0) || (rect->height_ == 0);
 }

@@ -413,7 +413,7 @@ static int QualityToMaxDiff(float quality) {
 // Assumes that an initial valid guess of change rectangle 'rect' is passed.
 static void MinimizeChangeRectangle(const WebPPicture* const src,
                                    const WebPPicture* const dst,
-                                    FrameRect* const rect,
+                                    FrameRectangle* const rect,
                                    int is_lossless, float quality) {
  int i, j;
  const ComparePixelsFunc compare_pixels =
@@ -498,7 +498,7 @@ static void MinimizeChangeRectangle(const WebPPicture* const src,
 }

 // Snap rectangle to even offsets (and adjust dimensions if needed).
-static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) {
+static WEBP_INLINE void SnapToEvenOffsets(FrameRectangle* const rect) {
  rect->width_ += (rect->x_offset_ & 1);
  rect->height_ += (rect->y_offset_ & 1);
  rect->x_offset_ &= ~1;
@@ -508,9 +508,9 @@ static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) {
 typedef struct {
  int should_try_;               // Should try this set of parameters.
  int empty_rect_allowed_;       // Frame with empty rectangle can be skipped.
-  FrameRect rect_ll_;            // Frame rectangle for lossless compression.
+  FrameRectangle rect_ll_;       // Frame rectangle for lossless compression.
  WebPPicture sub_frame_ll_;     // Sub-frame pic for lossless compression.
-  FrameRect rect_lossy_;         // Frame rectangle for lossy compression.
+  FrameRectangle rect_lossy_;    // Frame rectangle for lossy compression.
                                 // Could be smaller than rect_ll_ as pixels
                                 // with small diffs can be ignored.
  WebPPicture sub_frame_lossy_;  // Sub-frame pic for lossless compression.
@@ -538,7 +538,8 @@ static void SubFrameParamsFree(SubFrameParams* const params) {
 static int GetSubRect(const WebPPicture* const prev_canvas,
                      const WebPPicture* const curr_canvas, int is_key_frame,
                      int is_first_frame, int empty_rect_allowed,
-                      int is_lossless, float quality, FrameRect* const rect,
+                      int is_lossless, float quality,
+                      FrameRectangle* const rect,
                      WebPPicture* const sub_frame) {
  if (!is_key_frame || is_first_frame) {  // Optimize frame rectangle.
    // Note: This behaves as expected for first frame, as 'prev_canvas' is
@@ -594,7 +595,7 @@ int WebPAnimEncoderRefineRect(
    const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas,
    int is_lossless, float quality, int* const x_offset, int* const y_offset,
    int* const width, int* const height) {
-  FrameRect rect;
+  FrameRectangle rect;
  const int right = clip(*x_offset + *width, 0, curr_canvas->width);
  const int left = clip(*x_offset, 0, curr_canvas->width - 1);
  const int bottom = clip(*y_offset + *height, 0, curr_canvas->height);
@@ -620,7 +621,7 @@ int WebPAnimEncoderRefineRect(
 }

 static void DisposeFrameRectangle(int dispose_method,
-                                  const FrameRect* const rect,
+                                  const FrameRectangle* const rect,
                                  WebPPicture* const curr_canvas) {
  assert(rect != NULL);
  if (dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
@@ -628,13 +629,13 @@ static void DisposeFrameRectangle(int dispose_method,
  }
 }

-static uint32_t RectArea(const FrameRect* const rect) {
+static uint32_t RectArea(const FrameRectangle* const rect) {
  return (uint32_t)rect->width_ * rect->height_;
 }

 static int IsLosslessBlendingPossible(const WebPPicture* const src,
                                      const WebPPicture* const dst,
-                                      const FrameRect* const rect) {
+                                      const FrameRectangle* const rect) {
  int i, j;
  assert(src->width == dst->width && src->height == dst->height);
  assert(rect->x_offset_ + rect->width_ <= dst->width);
@@ -656,7 +657,7 @@ static int IsLosslessBlendingPossible(const WebPPicture* const src,

 static int IsLossyBlendingPossible(const WebPPicture* const src,
                                   const WebPPicture* const dst,
-                                   const FrameRect* const rect,
+                                   const FrameRectangle* const rect,
                                   float quality) {
  const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
  int i, j;
@@ -683,7 +684,7 @@ static int IsLossyBlendingPossible(const WebPPicture* const src,
 // transparent pixels.
 // Returns true if at least one pixel gets modified.
 static int IncreaseTransparency(const WebPPicture* const src,
-                                const FrameRect* const rect,
+                                const FrameRectangle* const rect,
                                WebPPicture* const dst) {
  int i, j;
  int modified = 0;
@@ -709,7 +710,7 @@ static int IncreaseTransparency(const WebPPicture* const src,
 // Assumes lossy compression is being used.
 // Returns true if at least one pixel gets modified.
 static int FlattenSimilarBlocks(const WebPPicture* const src,
-                                const FrameRect* const rect,
+                                const FrameRectangle* const rect,
                                WebPPicture* const dst, float quality) {
  const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
  int i, j;
@@ -778,13 +779,13 @@ static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic,
 typedef struct {
  WebPMemoryWriter  mem_;
  WebPMuxFrameInfo  info_;
-  FrameRect         rect_;
+  FrameRectangle    rect_;
  int               evaluate_;  // True if this candidate should be evaluated.
 } Candidate;

 // Generates a candidate encoded frame given a picture and metadata.
 static WebPEncodingError EncodeCandidate(WebPPicture* const sub_frame,
-                                         const FrameRect* const rect,
+                                         const FrameRectangle* const rect,
                                         const WebPConfig* const encoder_config,
                                         int use_blending,
                                         Candidate* const candidate) {
@@ -958,7 +959,7 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
  if (new_duration >= MAX_DURATION) {  // Special case.
    // Separate out previous frame from earlier merged frames to avoid overflow.
    // We add a 1x1 transparent frame for the previous frame, with blending on.
-    const FrameRect rect = { 0, 0, 1, 1 };
+    const FrameRectangle rect = { 0, 0, 1, 1 };
    const uint8_t lossless_1x1_bytes[] = {
      0x52, 0x49, 0x46, 0x46, 0x14, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50,
      0x56, 0x50, 0x38, 0x4c, 0x08, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00,
@@ -1223,7 +1224,7 @@ static int CacheFrame(WebPAnimEncoder* const enc,
      enc->prev_candidate_undecided_ = 0;
    } else {
      int64_t curr_delta;
-      FrameRect prev_rect_key, prev_rect_sub;
+      FrameRectangle prev_rect_key, prev_rect_sub;

      // Add this as a frame rectangle to enc.
      error_code = SetFrame(enc, config, 0, encoded_frame, &frame_skipped);
--- a/src/mux/muxinternal.c
+++ b/src/mux/muxinternal.c
@@ -504,6 +504,20 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
    if (!has_animation && (num_anim == 1 || num_frames > 0)) {
      return WEBP_MUX_INVALID_ARGUMENT;
    }
+    if (!has_animation) {
+      const WebPMuxImage* images = mux->images_;
+      // There can be only one image.
+      if (images == NULL || images->next_ != NULL) {
+        return WEBP_MUX_INVALID_ARGUMENT;
+      }
+      // Size must match.
+      if (mux->canvas_width_ > 0) {
+        if (images->width_ != mux->canvas_width_ ||
+            images->height_ != mux->canvas_height_) {
+          return WEBP_MUX_INVALID_ARGUMENT;
+        }
+      }
+    }
  }

  // Verify either VP8X chunk is present OR there is only one elem in
@@ -515,6 +529,7 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
  if (num_vp8x == 0 && num_images != 1) return WEBP_MUX_INVALID_ARGUMENT;

  // ALPHA_FLAG & alpha chunk(s) are consistent.
+  // Note: ALPHA_FLAG can be set when there is actually no Alpha data present.
  if (MuxHasAlpha(mux->images_)) {
    if (num_vp8x > 0) {
      // VP8X chunk is present, so it should contain ALPHA_FLAG.
@@ -525,8 +540,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
      if (err != WEBP_MUX_OK) return err;
      if (num_alpha > 0) return WEBP_MUX_INVALID_ARGUMENT;
    }
-  } else {  // Mux doesn't need alpha. So, ALPHA_FLAG should NOT be present.
-    if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
  }

  return WEBP_MUX_OK;
--- a/src/mux/muxread.c
+++ b/src/mux/muxread.c
@@ -270,6 +270,9 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
    ChunkInit(&chunk);
  }

+  // Incomplete image.
+  if (wpi->is_partial_) goto Err;
+
  // Validate mux if complete.
  if (MuxValidate(mux) != WEBP_MUX_OK) goto Err;

--- a/src/utils/bit_reader_utils.h
+++ b/src/utils/bit_reader_utils.h
@@ -155,9 +155,10 @@ static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {

 // For jumping over a number of bits in the bit stream when accessed with
 // VP8LPrefetchBits and VP8LFillBitWindow.
+// This function does *not* set br->eos_, since it's speed-critical.
+// Use with extreme care!
 static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
  br->bit_pos_ = val;
-  br->eos_ = VP8LIsEndOfStream(br);
 }

 // Advances the read buffer by 4 bytes to make room for reading next 32 bits.
--- a/src/utils/bit_writer_utils.c
+++ b/src/utils/bit_writer_utils.c
@@ -239,6 +239,24 @@ int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
  return VP8LBitWriterResize(bw, expected_size);
 }

+// Makes 'dst' a copy of the current write state of 'src': the bytes between
+// src->buf_ and src->cur_, plus the bits_/used_/error_ fields.
+// Returns 0 if VP8LBitWriterResize() fails on 'dst', 1 otherwise.
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+                       VP8LBitWriter* const dst) {
+  const size_t current_size = src->cur_ - src->buf_;
+  assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
+  if (!VP8LBitWriterResize(dst, current_size)) return 0;
+  memcpy(dst->buf_, src->buf_, current_size);
+  dst->bits_ = src->bits_;
+  dst->used_ = src->used_;
+  dst->error_ = src->error_;
+  // Only the bytes were copied: also place dst's cursor right after them, so
+  // that dst continues writing at the same offset as src.
+  dst->cur_ = dst->buf_ + current_size;
+  return 1;
+}
+
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
  if (bw != NULL) {
    WebPSafeFree(bw->buf_);
@@ -246,6 +264,21 @@ void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
  }
 }

+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+                        VP8LBitWriter* const bw) {  // bw keeps its buf_/end_
+  bw->bits_ = bw_init->bits_;  // bit state is taken over verbatim
+  bw->used_ = bw_init->used_;
+  bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);  // same byte offset
+  assert(bw->cur_ <= bw->end_);  // the offset must fit in bw's own buffer
+  bw->error_ = bw_init->error_;
+}
+
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
+  const VP8LBitWriter saved_dst = *dst;  // struct copy: pointers, not bytes
+  *dst = *src;
+  *src = saved_dst;
+}
+
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
  // If needed, make some room by flushing some bits out.
  if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
--- a/src/utils/bit_writer_utils.h
+++ b/src/utils/bit_writer_utils.h
@@ -100,16 +100,24 @@ typedef struct {
  int error_;
 } VP8LBitWriter;

-static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
+static WEBP_INLINE size_t VP8LBitWriterNumBytes(const VP8LBitWriter* const bw) {
  return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3);
 }

 // Returns false in case of memory allocation error.
 int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);
+// Returns false in case of memory allocation error.
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+                       VP8LBitWriter* const dst);
 // Finalize the bitstream coding. Returns a pointer to the internal buffer.
 uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw);
 // Release any pending memory and zeroes the object.
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw);
+// Resets bw's write state (cursor, bits, error flag) to match bw_init's.
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+                        VP8LBitWriter* const bw);
+// Swaps the memory held by two BitWriters.
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst);

 // Internal function for VP8LPutBits flushing 32 bits from the written state.
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw);
--- a/src/utils/color_cache_utils.h
+++ b/src/utils/color_cache_utils.h
@@ -15,6 +15,8 @@
 #ifndef WEBP_UTILS_COLOR_CACHE_H_
 #define WEBP_UTILS_COLOR_CACHE_H_

+#include <assert.h>
+
 #include "../webp/types.h"

 #ifdef __cplusplus
@@ -30,7 +32,7 @@ typedef struct {

 static const uint64_t kHashMul = 0x1e35a7bdull;

-static WEBP_INLINE int HashPix(uint32_t argb, int shift) {
+static WEBP_INLINE int VP8LHashPix(uint32_t argb, int shift) {
  return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
 }

@@ -48,19 +50,19 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,

 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                             uint32_t argb) {
-  const int key = HashPix(argb, cc->hash_shift_);
+  const int key = VP8LHashPix(argb, cc->hash_shift_);
  cc->colors_[key] = argb;
 }

 static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  return HashPix(argb, cc->hash_shift_);
+  return VP8LHashPix(argb, cc->hash_shift_);
 }

 // Return the key if cc contains argb, and -1 otherwise.
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  const int key = HashPix(argb, cc->hash_shift_);
+  const int key = VP8LHashPix(argb, cc->hash_shift_);
  return (cc->colors_[key] == argb) ? key : -1;
 }

--- a/src/utils/quant_levels_dec_utils.c
+++ b/src/utils/quant_levels_dec_utils.c
@@ -71,10 +71,11 @@ typedef struct {

 //------------------------------------------------------------------------------

-#define CLIP_MASK (int)(~0U << (8 + DFIX))
+#define CLIP_8b_MASK (int)(~0U << (8 + DFIX))
 static WEBP_INLINE uint8_t clip_8b(int v) {
-  return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
+  return (!(v & CLIP_8b_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
 }
+#undef CLIP_8b_MASK

 // vertical accumulation
 static void VFilter(SmoothParams* const p) {
--- a/src/utils/utils.c
+++ b/src/utils/utils.c
@@ -16,6 +16,7 @@
 #include "../webp/decode.h"
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"  // for MAX_PALETTE_SIZE
+#include "./color_cache_utils.h"
 #include "./utils.h"

 // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
@@ -252,7 +253,6 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
  int num_colors = 0;
  uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
  uint32_t colors[COLOR_HASH_SIZE];
-  static const uint64_t kHashMul = 0x1e35a7bdull;
  const uint32_t* argb = pic->argb;
  const int width = pic->width;
  const int height = pic->height;
@@ -267,7 +267,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
        continue;
      }
      last_pix = argb[x];
-      key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT;
+      key = VP8LHashPix(last_pix, COLOR_HASH_RIGHT_SHIFT);
      while (1) {
        if (!in_use[key]) {
          colors[key] = last_pix;
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@@ -66,7 +66,7 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
 static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
  uint32_t A;
-  memcpy(&A, (const int*)ptr, sizeof(A));
+  memcpy(&A, ptr, sizeof(A));
  return A;
 }
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
@@ -112,12 +112,12 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
 #define WEBP_NEED_LOG_TABLE_8BIT
 extern const uint8_t WebPLogTable8bit[256];
 static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
-  int log = 0;
+  int log_value = 0;
  while (n >= 256) {
-    log += 8;
+    log_value += 8;
    n >>= 8;
  }
-  return log + WebPLogTable8bit[n];
+  return log_value + WebPLogTable8bit[n];
 }

 // Returns (int)floor(log2(n)). n must be > 0.
--- a/src/webp/encode.h
+++ b/src/webp/encode.h
@@ -93,7 +93,11 @@ typedef enum WebPImageHint {
 // Compression parameters.
 struct WebPConfig {
  int lossless;           // Lossless encoding (0=lossy(default), 1=lossless).
-  float quality;          // between 0 (smallest file) and 100 (biggest)
+  float quality;          // between 0 and 100. For lossy, 0 gives the smallest
+                          // size and 100 the largest. For lossless, this
+                          // parameter is the amount of effort put into the
+                          // compression: 0 is the fastest but gives larger
+                          // files compared to the slowest, but best, 100.
  int method;             // quality/speed trade-off (0=fast, 6=slower-better)

  WebPImageHint image_hint;  // Hint for image type (lossless only for now).
--- a/webp_js/index.html
+++ b/webp_js/index.html
@@ -0,0 +1,72 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+  <meta charset="UTF-8">
+  <title>simple Javascript WebP decoding demo</title>
+  <script type="text/javascript">
+    var Module = {
+      noInitialRun : true
+    };
+  </script>
+  <script type="text/javascript" src="./webp.js"></script>
+  <script type="text/javascript">
+
+// main wrapper for the function decoding a WebP into a canvas object
+var WebpToCanvas;
+
+function init() {  // <body onload> hook: wraps the exported 'WebpToSDL'
+  WebpToCanvas = Module.cwrap('WebpToSDL', 'number', ['array', 'number']);
+}
+
+function decode(webp_data, canvas_id) {  // WebP bytes -> canvas 'canvas_id'
+  // get the canvas to decode into
+  var canvas = document.getElementById(canvas_id);
+  if (canvas == null) return;
+  // clear previous picture (if any)
+  Module.canvas = canvas;
+  canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height);
+  // decode and measure timing ('var' keeps the timestamps out of the globals)
+  var start = new Date();
+  var ret = WebpToCanvas(webp_data, webp_data.length);
+  var end = new Date();
+  var speed_result = document.getElementById('timing');
+  // display timing result
+  if (speed_result != null) {
+    var decode_time = end - start;
+    speed_result.innerHTML = '<p>decoding time: ' + decode_time +' ms.</p>';
+  }
+}
+
+function loadfile(filename, canvas_id) {
+  // fetch 'filename' as binary data, then decode it into 'canvas_id'
+  var request = new XMLHttpRequest();
+  request.open('GET', filename);
+  request.responseType = 'arraybuffer';
+  request.onreadystatechange = function() {
+    // ignore every state change but the final, successful one
+    if (request.readyState != 4 || request.status != 200) return;
+    decode(new Uint8Array(request.response), canvas_id);
+  };
+  request.send();
+}
+  </script>
+</head>
+
+<body onload='init()'>
+  <p>
+    <strong>WebP in JavaScript demo</strong> -
+  </p>
+  <p>
+    WebP decoder in JavaScript, using libwebp compiled with
+    <a href="https://github.com/kripken/emscripten/wiki">Emscripten</a>.
+  </p>
+  <p id="image_buttons">
+    <input type="button" value="test image!" name="./test_webp_js.webp"
+           onclick="loadfile(this.name, 'output_canvas')">
+  </p>
+  <p id="timing">Timing: N/A</p>
+  <canvas id="output_canvas">Your browser does not support canvas</canvas>
+
+</body>
+</html>
--- a/webp_js/index_wasm.html
+++ b/webp_js/index_wasm.html
@@ -0,0 +1,84 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+  <meta charset="UTF-8">
+  <title>simple Javascript WebP decoding demo, using Web-Assembly (WASM)</title>
+  <script type="text/javascript">
+    var Module = {
+      noInitialRun : true
+    };
+  </script>
+  <script type="text/javascript">
+
+function init() {  // <body onload> hook: fetch the .wasm, then load the glue
+  var xhr = new XMLHttpRequest();
+  xhr.open('GET', 'webp_wasm.wasm', true);  // true: asynchronous request
+  xhr.responseType = 'arraybuffer';
+  xhr.onload = function() {
+    Module.wasmBinary = xhr.response;  // must be set before webp_wasm.js runs
+    var script = document.createElement('script');
+    script.src = "webp_wasm.js";
+    document.body.appendChild(script);  // starts loading webp_wasm.js
+  };
+  xhr.send(null);
+}
+
+function decode(webp_data, canvas_id) {  // WebP bytes -> canvas 'canvas_id'
+  var result;
+  if (Module["asm"] != undefined) {
+    var WebpToCanvas =  // wrapper decoding a WebP into a canvas object
+        Module.cwrap('WebpToSDL', 'number', ['array', 'number']);
+    // get the canvas to decode into
+    var canvas = document.getElementById(canvas_id);
+    if (canvas == null) return;
+    // clear previous picture (if any)
+    Module.canvas = canvas;
+    canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height);
+    // decode and measure timing ('var' keeps these out of the global scope)
+    var start = new Date();
+    var ret = WebpToCanvas(webp_data, webp_data.length);
+    var end = new Date();
+    var decode_time = end - start;
+    result = 'decoding time: ' + decode_time +' ms.';
+  } else {
+    result = "WASM module not finished loading! Please retry";
+  }
+  // display timing result
+  var speed_result = document.getElementById('timing');
+  if (speed_result != null) {
+    speed_result.innerHTML = '<p>'+ result + '</p>';
+  }
+}
+
+function loadfile(filename, canvas_id) {
+  // fetch 'filename' as binary data, then decode it into 'canvas_id'
+  var request = new XMLHttpRequest();
+  request.open('GET', filename);
+  request.responseType = 'arraybuffer';
+  request.onreadystatechange = function() {
+    // ignore every state change but the final, successful one
+    if (request.readyState != 4 || request.status != 200) return;
+    decode(new Uint8Array(request.response), canvas_id);
+  };
+  request.send();
+}
+  </script>
+</head>
+
+<body onload='init()'>
+  <p>
+    <strong>WebP demo using Web-Assembly</strong> -
+  </p>
+  <p>
+    WASM version of the WebP decoder, using libwebp compiled with
+    <a href="https://github.com/kripken/emscripten/wiki">Emscripten</a>.
+  </p>
+  <p id="image_buttons">
+    <input type="button" value="test image!"
+           onclick="loadfile('./test_webp_wasm.webp', 'output_canvas')">
+  </p>
+  <p id="timing">Timing: N/A</p>
+  <canvas id="output_canvas">Your browser does not support canvas</canvas>
+</body>
+</html>
--- a/webp_js/test_webp_js.webp
+++ b/webp_js/test_webp_js.webp
--- a/webp_js/test_webp_wasm.webp
+++ b/webp_js/test_webp_wasm.webp